fix(vision): cap embedded image size before it wedges a session (#35732)
Resize vision tool-result images down to a 4 MB embed cap at load time, not just at the 20 MB hard ceiling. A 5-20 MB image previously sailed through the native fast path and got baked into conversation history, where Anthropic's 5 MB per-image base64 limit rejected every subsequent turn with a 400 — and because history is immutable, retries could never clear it, permanently wedging the session. Also harden the reactive shrink-recovery: it now returns False (don't retry) when any oversized image part can't be brought under target, so the single retry isn't burned re-sending a payload that will fail identically. Previously it returned True after shrinking *any* part, even when the actual oversized culprit survived.
This commit is contained in:
@ -644,6 +644,12 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
|
||||
# after a confirmed provider rejection, so the alternative is failure.
|
||||
target_bytes = 4 * 1024 * 1024
|
||||
changed_count = 0
|
||||
# Track parts that are over the target but could NOT be shrunk under it.
|
||||
# If any survive, retrying is pointless — the same oversized payload will
|
||||
# be re-sent and rejected again, wasting the single retry budget. We only
|
||||
# report success (caller retries) when every over-threshold image was
|
||||
# actually brought under the target.
|
||||
unshrinkable_oversized = 0
|
||||
|
||||
def _shrink_data_url(url: str) -> Optional[str]:
|
||||
"""Return a smaller data URL, or None if shrink can't help."""
|
||||
@ -710,17 +716,34 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
|
||||
if resized:
|
||||
image_value["url"] = resized
|
||||
changed_count += 1
|
||||
elif isinstance(url, str) and url.startswith("data:") \
|
||||
and len(url) > target_bytes:
|
||||
unshrinkable_oversized += 1
|
||||
elif isinstance(image_value, str):
|
||||
resized = _shrink_data_url(image_value)
|
||||
if resized:
|
||||
part["image_url"] = resized
|
||||
changed_count += 1
|
||||
elif image_value.startswith("data:") \
|
||||
and len(image_value) > target_bytes:
|
||||
unshrinkable_oversized += 1
|
||||
|
||||
if changed_count:
|
||||
logger.info(
|
||||
"image-shrink recovery: re-encoded %d image part(s) to fit under %.0f MB",
|
||||
changed_count, target_bytes / (1024 * 1024),
|
||||
)
|
||||
if unshrinkable_oversized:
|
||||
# At least one oversized image could not be shrunk under the target.
|
||||
# Retrying would re-send it and fail identically, so signal "no
|
||||
# progress" even if other parts shrank — the caller will surface the
|
||||
# original error rather than burning its single retry on a no-op.
|
||||
logger.warning(
|
||||
"image-shrink recovery: %d oversized image part(s) could not be "
|
||||
"shrunk under %.0f MB — not retrying (would re-send rejected payload)",
|
||||
unshrinkable_oversized, target_bytes / (1024 * 1024),
|
||||
)
|
||||
return False
|
||||
return changed_count > 0
|
||||
|
||||
|
||||
|
||||
@ -273,3 +273,51 @@ class TestShrinkImagePartsHelper:
|
||||
assert agent._try_shrink_image_parts_in_messages(msgs) is False
|
||||
# Original URL still in place, not replaced by the bigger one.
|
||||
assert msgs[0]["content"][0]["image_url"]["url"] == oversized_url
|
||||
|
||||
def test_mixed_one_shrinkable_one_not_returns_false(self, monkeypatch):
    """A batch where only one of two oversized images shrinks must not retry.

    Regression for the wedged-session incident (May 2026): the helper used
    to report success as soon as *any* part was re-encoded, so the caller
    spent its single retry re-sending the oversized image that survived and
    got the identical rejection. With one shrinkable and one unshrinkable
    image in the same tool message, the helper has to return False, while
    still mutating the part it was able to shrink.
    """
    agent = _make_agent()
    fits_url = _big_png_data_url(5000)
    stuck_url = _big_png_data_url(6000)
    tiny_url = "data:image/jpeg;base64," + "C" * 500

    # Anything whose on-disk size exceeds this is treated as "cannot shrink";
    # it sits between the two fixtures (5000 KB and 6000 KB sources).
    size_cutoff = 5500 * 1024

    def _stub_resize(path, *args, **kwargs):
        # The helper hands us a temp file holding the decoded image bytes,
        # so file size is what tells the two inputs apart. If the path can't
        # be stat'ed, fall back to 0 and treat the input as shrinkable.
        try:
            on_disk = path.stat().st_size
        except Exception:
            on_disk = 0
        # Echo the oversized payload back for the big input; otherwise
        # pretend the resize produced a small data URL.
        return stuck_url if on_disk > size_cutoff else tiny_url

    monkeypatch.setattr(
        "tools.vision_tools._resize_image_for_vision",
        _stub_resize,
        raising=False,
    )

    parts = [
        {"type": "image_url", "image_url": {"url": candidate}}
        for candidate in (fits_url, stuck_url)
    ]
    msgs = [{"role": "tool", "content": parts}]

    # One part shrank, one survived oversized -> the caller must NOT retry.
    should_retry = agent._try_shrink_image_parts_in_messages(msgs)
    assert should_retry is False

    content = msgs[0]["content"]
    # The shrinkable part was still re-encoded in place.
    assert content[0]["image_url"]["url"] == tiny_url
    # The unshrinkable part is untouched (caller surfaces the original error).
    assert content[1]["image_url"]["url"] == stuck_url
|
||||
|
||||
@ -139,6 +139,44 @@ class TestVisionAnalyzeNative:
|
||||
assert isinstance(result, dict)
|
||||
assert result.get("_multimodal") is True
|
||||
|
||||
def test_oversized_image_resized_under_embed_cap(self, tmp_path):
    """Regression for the wedged-session incident (May 2026).

    A vision tool-result image is baked into conversation history and
    re-sent on every subsequent turn. Anthropic rejects any single
    base64 image over 5 MB with a 400, and immutable history means the
    bad bytes can't be cleared by retrying — the session is permanently
    wedged. The native fast path must proactively resize down to the
    embed cap (well under 5 MB) BEFORE embedding, not just at the 20 MB
    hard ceiling. Skips if Pillow isn't available (resize is a no-op).
    """
    import pytest

    try:
        from PIL import Image
    except ImportError:
        pytest.skip("Pillow not installed — proactive resize is a no-op")

    from tools.vision_tools import _EMBED_TARGET_BYTES

    # Noisy PNG that base64-encodes to well over 5 MB (won't compress much).
    big = tmp_path / "big.png"
    Image.effect_noise((2600, 2600), 80).convert("RGB").save(big, format="PNG")
    assert big.stat().st_size * 4 // 3 > 5 * 1024 * 1024, "test image not big enough"

    # Run the coroutine on a private loop instead of
    # asyncio.get_event_loop(): the latter is deprecated when no loop is
    # running and raises RuntimeError once another test has called
    # asyncio.run() (which clears the current loop). A private loop also
    # leaves the global loop state untouched for sibling tests.
    loop = asyncio.new_event_loop()
    try:
        result = loop.run_until_complete(
            _vision_analyze_native(str(big), "describe")
        )
    finally:
        loop.close()

    assert isinstance(result, dict) and result.get("_multimodal") is True

    # Default to None so a missing image part fails with a readable
    # assertion rather than a bare StopIteration.
    url = next(
        (
            p["image_url"]["url"]
            for p in result["content"]
            if p.get("type") == "image_url"
        ),
        None,
    )
    assert url is not None, "native vision result contains no image_url part"
    assert len(url) <= _EMBED_TARGET_BYTES, (
        f"embedded image {len(url) / 1024 / 1024:.1f} MB exceeds embed cap "
        f"{_EMBED_TARGET_BYTES / 1024 / 1024:.0f} MB — would wedge sessions on Anthropic"
    )
|
||||
|
||||
|
||||
# ─── _handle_vision_analyze fast-path gating ─────────────────────────────────
|
||||
|
||||
|
||||
@ -311,10 +311,21 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
|
||||
return data_url
|
||||
|
||||
|
||||
# Absolute hard ceiling for vision API payloads (20 MB) — above this, no major
# provider accepts the image and we reject outright. The figure corresponds to
# the Gemini inline data limit. It is NOT the tightest per-image limit in
# play: Anthropic's 5 MB base64 limit is lower, which is why the separate
# embed cap below exists.
_MAX_BASE64_BYTES = 20 * 1024 * 1024

# Proactive embed cap (4 MB). This is the size we resize an image DOWN to
# before embedding it into conversation history, regardless of the 20 MB hard
# ceiling. Anthropic's per-image base64 limit is 5 MB; once an oversized image
# is baked into history (e.g. a vision tool-result), it is re-sent on every
# subsequent turn and permanently wedges the session with a 400 that retries
# can't clear (the bad bytes are immutable history). Capping at embed time —
# with headroom under 5 MB — is the only durable fix. Matches the post-failure
# shrink target in agent.conversation_compression so behaviour is consistent
# whether we resize proactively or reactively.
_EMBED_TARGET_BYTES = 4 * 1024 * 1024

# Target size when auto-resizing on API failure (5 MB). After a provider
# rejects an image, we downscale to this target and retry once.
# NOTE(review): this equals the 5 MB Anthropic limit cited above and so leaves
# no headroom, unlike the 4 MB embed cap — confirm the difference is intended.
_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
|
||||
@ -656,11 +667,21 @@ async def _vision_analyze_native(
|
||||
temp_image_path, mime_type=detected_mime_type,
|
||||
)
|
||||
|
||||
# Honour the same hard cap as the legacy path. Resize if needed.
|
||||
if len(image_data_url) > _MAX_BASE64_BYTES:
|
||||
# Proactive embed cap: this image gets baked into conversation
|
||||
# history and re-sent on every subsequent turn. Anthropic rejects
|
||||
# any single base64 image over 5 MB with a 400, and because history
|
||||
# is immutable, an oversized embed permanently wedges the session —
|
||||
# retries can't clear bytes that are already in the request. Resize
|
||||
# DOWN to the embed target (4 MB, headroom under 5 MB) whenever the
|
||||
# payload exceeds it, not just at the 20 MB hard ceiling.
|
||||
if len(image_data_url) > _EMBED_TARGET_BYTES:
|
||||
image_data_url = _resize_image_for_vision(
|
||||
temp_image_path, mime_type=detected_mime_type,
|
||||
max_base64_bytes=_EMBED_TARGET_BYTES,
|
||||
)
|
||||
# If even resizing can't get under the absolute hard ceiling,
|
||||
# there's nothing more we can do — reject rather than embed a
|
||||
# session-wedging payload.
|
||||
if len(image_data_url) > _MAX_BASE64_BYTES:
|
||||
return tool_error(
|
||||
f"Image too large for vision API: base64 payload is "
|
||||
|
||||
Reference in New Issue
Block a user