Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions lib/matplotlib/backends/backend_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2104,12 +2104,26 @@ def draw_path_collection(self, gc, master_transform, paths, all_transforms,

padding = np.max(linewidths)
path_codes = []

# Calculate maximum marker extent for conservative bounds checking.
# We need to account for marker size, not just position.
max_marker_extent = 0
for i, (path, transform) in enumerate(self._iter_collection_raw_paths(
master_transform, paths, all_transforms)):
if len(path.vertices):
# Get the bounding box of the transformed marker path.
# Use get_extents() which is more efficient than transforming
# all vertices, and add padding for stroke width.
bbox = path.get_extents(transform)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you test that this works as expected with log scale and polar?

max_marker_extent = max(max_marker_extent,
bbox.width / 2, bbox.height / 2)
name = self.file.pathCollectionObject(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this write something into the file our only update our Python side sate?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not write into file immediately but seems it writes after all drawing is done. Let me fix this by creating path templates only for used path_ids and compute extents for creted paths.

gc, path, transform, padding, filled, stroked)
path_codes.append(name)

# Add padding for stroke width.
max_marker_extent += padding

output = self.file.output
output(*self.gc.push())
lastx, lasty = 0, 0
Expand All @@ -2118,6 +2132,15 @@ def draw_path_collection(self, gc, master_transform, paths, all_transforms,
facecolors, edgecolors, linewidths, linestyles,
antialiaseds, urls, offset_position, hatchcolors=hatchcolors):

# Skip markers outside visible canvas bounds to reduce PDF size.
# Add max_marker_extent margin to account for marker size - a marker
# may be partially visible even if its center is outside the canvas.
canvas_width = self.file.width * 72
canvas_height = self.file.height * 72
if not (-max_marker_extent <= xo <= canvas_width + max_marker_extent
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you use a maximum extent rather than doing the computation here and having per-maker and per-direction filtering?

We have to compute the extent of every maker no matter what so might as well get the better behavior by doing it here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially used a single max_marker_extent to simplify the bounds check and avoid recalculating the extent for each marker.

Yes, you're right - per-marker filtering is more precise. I've updated the commit.

and -max_marker_extent <= yo <= canvas_height + max_marker_extent):
continue

self.check_gc(gc0, rgbFace)
dx, dy = xo - lastx, yo - lasty
output(1, 0, 0, 1, dx, dy, Op.concat_matrix, path_id,
Expand Down
109 changes: 109 additions & 0 deletions lib/matplotlib/tests/test_backend_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,3 +478,112 @@ def test_font_bitstream_charter():
ax.text(0.1, 0.3, r"fi ffl 1234", usetex=True, fontsize=50)
ax.set_xticks([])
ax.set_yticks([])


def test_scatter_offaxis_colored_pdf_size():
"""
Test that off-axis scatter plots with per-point colors don't bloat PDFs.

Regression test for issue #2488. When scatter points with per-point colors
are completely outside the visible axes, the PDF backend should skip
writing those markers to significantly reduce file size.
"""
# Use John Hunter's birthday as random seed for reproducibility
rng = np.random.default_rng(19680801)

n_points = 1000
x = rng.random(n_points) * 10
y = rng.random(n_points) * 10
c = rng.random(n_points)

# Test 1: Scatter with per-point colors, all points OFF-AXIS
fig1, ax1 = plt.subplots()
ax1.scatter(x, y, c=c)
ax1.set_xlim(20, 30) # Move view completely away from data (x is 0-10)
ax1.set_ylim(20, 30) # Move view completely away from data (y is 0-10)

buf1 = io.BytesIO()
fig1.savefig(buf1, format='pdf')
size_offaxis_colored = buf1.tell()
plt.close(fig1)

# Test 2: Empty scatter (baseline - smallest possible)
fig2, ax2 = plt.subplots()
ax2.set_xlim(20, 30)
ax2.set_ylim(20, 30)

buf2 = io.BytesIO()
fig2.savefig(buf2, format='pdf')
size_empty = buf2.tell()
plt.close(fig2)

# The off-axis colored scatter should be close to empty size
# Allow up to 50KB overhead for axes/metadata, but should be much smaller
# than if all 1000 markers were written (which would add ~200-400KB)
assert size_offaxis_colored < size_empty + 50_000, (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 50kb padding is confusing to me. The axes should be the same output either way (as the limits are the same etc). If there was anything extra I would expect it to be the structures for the scatter that has no paths (there are some open/close group stuff emitted if I recall correctly). To account for that we could ad ax2.scatter([], []) to the empty test.

It is probably worth also adding a third example where there are markers in in the plot (ax3.scatter(x+20, y+20, c=c)) to test that fully off axis one is about the same size as the empty one and both are smaller than the one with visible markers.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've updated the tests per your suggestions:

  1. Added ax2.scatter([], []) to the baseline test to match the axes structure exactly
  2. Reduced tolerance from 50KB to 5KB since axes output is now identical
  3. Added a third test case with visible markers (x + 20, y + 20, c=c) that validates:
    • Visible scatter is >50KB larger than empty
    • Visible scatter is >50KB larger than off-axis

Now, we can compare all three outputs:

  • fully off-axis scatter --> small file
  • empty scatter --> small file
  • visible scatter --> larger file

f"Off-axis colored scatter PDF ({size_offaxis_colored} bytes) is too large. "
f"Expected close to empty figure size ({size_empty} bytes). "
f"Markers may not be properly skipped."
)


@check_figures_equal(extensions=["pdf"])
def test_scatter_offaxis_colored_visual(fig_test, fig_ref):
"""
Test that on-axis scatter with per-point colors still renders correctly.

Ensures the optimization for off-axis markers doesn't break normal
scatter rendering.
"""
rng = np.random.default_rng(19680801)

n_points = 100
x = rng.random(n_points) * 5
y = rng.random(n_points) * 5
c = rng.random(n_points)

# Test figure: scatter with clipping optimization
ax_test = fig_test.subplots()
ax_test.scatter(x, y, c=c, s=50)
ax_test.set_xlim(0, 10)
ax_test.set_ylim(0, 10)

# Reference figure: should look identical
ax_ref = fig_ref.subplots()
ax_ref.scatter(x, y, c=c, s=50)
ax_ref.set_xlim(0, 10)
ax_ref.set_ylim(0, 10)


@check_figures_equal(extensions=["pdf"])
def test_scatter_mixed_onoff_axis(fig_test, fig_ref):
"""
Test scatter with some points on-axis and some off-axis.

Ensures the optimization correctly handles the common case where only
some markers are outside the visible area.
"""
rng = np.random.default_rng(19680801)

# Create points: half on-axis (0-5), half off-axis (15-20)
n_points = 50
x_on = rng.random(n_points) * 5
y_on = rng.random(n_points) * 5
x_off = rng.random(n_points) * 5 + 15
y_off = rng.random(n_points) * 5 + 15

x = np.concatenate([x_on, x_off])
y = np.concatenate([y_on, y_off])
c = rng.random(2 * n_points)

# Test figure: scatter with mixed points
ax_test = fig_test.subplots()
ax_test.scatter(x, y, c=c, s=50)
ax_test.set_xlim(0, 10)
ax_test.set_ylim(0, 10)

# Reference figure: only the on-axis points should be visible
ax_ref = fig_ref.subplots()
ax_ref.scatter(x_on, y_on, c=c[:n_points], s=50)
ax_ref.set_xlim(0, 10)
ax_ref.set_ylim(0, 10)
Loading