The bottom part of the figure shows the overall architecture. Given an input image and a text instruction, (1) the VLM analyzes them via chain-of-thought reasoning and (2) produces region-aligned guidance, where each guidance item includes a region bounding box and its editing hint. (3) Each hint is further encoded by a text encoder into a feature token, while image patch tokens are obtained by VAE encoding and grouped according to the region bounding boxes. (4) A training-free group-specific attention mechanism is proposed to allow MMDiT to generate the final edited image. The top part of the figure presents editing examples.
);
// 3. Benchmark
/**
 * Benchmark section of the page.
 *
 * Parameterless arrow component that returns static content: an intro to
 * what IV-Edit is designed to test, a large case-gallery placeholder, and
 * two chart placeholders (referring types and task types).
 *
 * NOTE(review): the element markup around the text below is not visible in
 * this view — confirm the structure against the full file before editing.
 */
const Benchmark = () => (
Existing datasets mostly focus on subject-dominated images or simple instructions. IV-Edit is designed to test:
Benchmark Case Gallery (Large Image Placeholder)
Place 'benchmark_cases_large.png' here
{/* 3. Statistics Charts */}
Referring Chart Placeholder
{/* Chart 2: Task Types */}
Task Chart Placeholder
);
// 4. Experiments
const Experiments = () => {
// Rows of the quantitative results table, in display order.
//
// Row shapes:
//   - { divider: true }  — marker row; the row mapper below checks
//     `row.divider` and renders it differently from a model row.
//   - model row          — `group` ("proprietary" | "opensource" | "ours"),
//     `model` (display name) and six score fields: quality, target, effect,
//     consistency, overall, weighted.
//
// A score is either a plain string or an object `{ val, bold?, underline? }`;
// both forms are passed through `renderCell`. Per the table footnote, bold
// marks the best and underline the second-best value among open-source models.
//
// `highlightRow: true` is set only on the "ours" rows.
// NOTE(review): where `group` and `highlightRow` are consumed is not visible
// in this view — presumably row styling; confirm.
const data = [
// Proprietary reference models.
{
group: "proprietary",
model: "Gemini-Flash-Image",
quality: "3.89", target: "4.11", effect: "3.93", consistency: "2.89", overall: "3.71", weighted: "3.44"
},
{
group: "proprietary",
model: "GPT-4o",
quality: "3.61", target: "4.02", effect: "3.78", consistency: "1.77", overall: "3.30", weighted: "3.07"
},
{ divider: true },
// Open-source baselines.
{
group: "opensource",
model: "InstructPix2Pix",
quality: "2.47", target: "2.47", effect: "1.90", consistency: "1.40", overall: "2.06", weighted: "1.48"
},
{
group: "opensource",
model: "Uniworld-V1",
quality: "3.26", target: "2.89", effect: "2.18", consistency: "1.46", overall: "2.45", weighted: "1.84"
},
{
group: "opensource",
model: "Bagel-Think",
quality: "3.44", target: "3.47", effect: "2.93", consistency: "2.33", overall: "3.05", weighted: "2.46"
},
{ divider: true },
// Each backbone is listed directly above its RePlan counterpart.
{
group: "opensource",
model: "Flux.1 Kontext dev",
quality: {val: "3.93", underline: true}, target: "3.34", effect: "2.73", consistency: "2.88", overall: "3.22", weighted: "2.49"
},
{
group: "ours",
model: "RePlan (Flux.1 Kontext dev)",
quality: {val: "4.16", bold: true}, target: "3.47", effect: "2.59", consistency: {val: "3.64", bold: true}, overall: {val: "3.46", underline: true}, weighted: "2.55",
highlightRow: true
},
{
group: "opensource",
model: "Qwen-Image-Edit",
quality: "3.47", target: {val: "3.72", underline: true}, effect: {val: "3.24", bold: true}, consistency: "1.79", overall: "3.05", weighted: {val: "2.62", underline: true}
},
{
group: "ours",
model: "RePlan (Qwen-Image-Edit)",
quality: "3.86", target: {val: "3.77", bold: true}, effect: {val: "3.16", underline: true}, consistency: {val: "3.24", underline: true}, overall: {val: "3.51", bold: true}, weighted: {val: "2.91", bold: true},
highlightRow: true
},
];
// Renders one score cell of the results table.
//
// `content` is a score field from `data`: either a plain string, which is
// returned unchanged, or an object whose `val` is what gets displayed.
//
// NOTE(review): the element that wraps `content.val` in the object branch is
// not visible in this view — presumably it applies the `bold` / `underline`
// flags; confirm against the full file.
// NOTE(review): `typeof null === 'object'`, so a null score would reach the
// object branch and throw on `content.val`; no row in `data` uses null today.
const renderCell = (content) => {
if (typeof content === 'object') {
return (
{content.val}
);
}
return content;
};
// Image paths for the additional qualitative-results carousel, in display
// order. NOTE(review): the order is 1, 3, 4, 2 rather than numeric —
// presumably deliberate; confirm before "fixing" it.
const extraImages = [
"assets/more_comparative/1.jpg",
"assets/more_comparative/3.jpg",
"assets/more_comparative/4.jpg",
"assets/more_comparative/2.jpg",
];
// Index into `extraImages` of the slide currently shown; starts at the first.
const [currentExtraIndex, setCurrentExtraIndex] = useState(0);
// Error flag for the current slide: set to true via `setImgError(true)` in the
// render output and reset to false by the effect keyed on `currentExtraIndex`.
const [imgError, setImgError] = useState(false);
// Advance the carousel by one slide; stepping past the last entry of
// `extraImages` wraps around to index 0.
const nextImage = () => {
  setCurrentExtraIndex((index) => {
    const following = index + 1;
    return following % extraImages.length;
  });
};
// Step the carousel back by one slide; stepping back from index 0 wraps
// around to the last entry of `extraImages`.
const prevImage = () => {
  setCurrentExtraIndex((index) => {
    const total = extraImages.length;
    // Add `total` before the modulo so the result never goes negative.
    return (index - 1 + total) % total;
  });
};
// Clear the error flag whenever the active slide changes (and on first
// mount), so an error recorded for one slide does not carry over to the next.
useEffect(() => {
setImgError(false);
}, [currentExtraIndex]);
return (
* Bold indicates the best performance, and underline indicates the second best among open-source models.
Top section (gray background) lists proprietary closed-source models for reference.
{/* Additional Qualitative Results Carousel */}
Result {currentExtraIndex + 1}
assets/extra_comparisons/comp{currentExtraIndex + 1}.png
)}
{/* Navigation Controls */}
Why a new benchmark?
{[
"Fine-grained visual referring",
"Multi-region editing",
"Knowledge-based reasoning",
"Local text editing",
].map((item, i) => (
{ e.target.style.display = 'none'; if (e.target.nextElementSibling) e.target.nextElementSibling.style.display = 'flex'; }}
/>
Referring Types
{ e.target.style.display = 'none'; if (e.target.nextElementSibling) e.target.nextElementSibling.style.display = 'flex'; }}
/>
Task Types
{ e.target.style.display = 'none'; if (e.target.nextElementSibling) e.target.nextElementSibling.style.display = 'flex'; }}
/>
{data.map((row, i) => {
if (row.divider) {
return (
Model
Quality ↑
Target ↑
Effect ↑
Consistency ↑
Overall ↑
Weighted ↑
);
}
return (
{row.model}
{renderCell(row.quality)}
{renderCell(row.target)}
{renderCell(row.effect)}
{renderCell(row.consistency)}
{renderCell(row.overall)}
{renderCell(row.weighted)}
);
})}
setImgError(true)}
/>
) : (