diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py index ba8678cc7..a0b077e3e 100644 --- a/agent/conversation_compression.py +++ b/agent/conversation_compression.py @@ -644,6 +644,12 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool: # after a confirmed provider rejection, so the alternative is failure. target_bytes = 4 * 1024 * 1024 changed_count = 0 + # Track parts that are over the target but could NOT be shrunk under it. + # If any survive, retrying is pointless — the same oversized payload will + # be re-sent and rejected again, wasting the single retry budget. We only + # report success (caller retries) when every over-threshold image was + # actually brought under the target. + unshrinkable_oversized = 0 def _shrink_data_url(url: str) -> Optional[str]: """Return a smaller data URL, or None if shrink can't help.""" @@ -710,17 +716,34 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool: if resized: image_value["url"] = resized changed_count += 1 + elif isinstance(url, str) and url.startswith("data:") \ + and len(url) > target_bytes: + unshrinkable_oversized += 1 elif isinstance(image_value, str): resized = _shrink_data_url(image_value) if resized: part["image_url"] = resized changed_count += 1 + elif image_value.startswith("data:") \ + and len(image_value) > target_bytes: + unshrinkable_oversized += 1 if changed_count: logger.info( "image-shrink recovery: re-encoded %d image part(s) to fit under %.0f MB", changed_count, target_bytes / (1024 * 1024), ) + if unshrinkable_oversized: + # At least one oversized image could not be shrunk under the target. + # Retrying would re-send it and fail identically, so signal "no + # progress" even if other parts shrank — the caller will surface the + # original error rather than burning its single retry on a no-op. 
+ logger.warning( + "image-shrink recovery: %d oversized image part(s) could not be " + "shrunk under %.0f MB — not retrying (would re-send rejected payload)", + unshrinkable_oversized, target_bytes / (1024 * 1024), + ) + return False return changed_count > 0 diff --git a/tests/run_agent/test_image_shrink_recovery.py b/tests/run_agent/test_image_shrink_recovery.py index c5114ffef..86a3e6abf 100644 --- a/tests/run_agent/test_image_shrink_recovery.py +++ b/tests/run_agent/test_image_shrink_recovery.py @@ -273,3 +273,51 @@ class TestShrinkImagePartsHelper: assert agent._try_shrink_image_parts_in_messages(msgs) is False # Original URL still in place, not replaced by the bigger one. assert msgs[0]["content"][0]["image_url"]["url"] == oversized_url + + def test_mixed_one_shrinkable_one_not_returns_false(self, monkeypatch): + """Regression for the wedged-session incident (May 2026). + + When one oversized image shrinks but another oversized image can't, + the helper must return False — retrying would re-send the surviving + oversized payload and fail identically, burning the single retry on a + no-op. The original bug returned True after shrinking *any* part, + which is what permanently wedged a session whose history held a 12 MB + tool-result image alongside a freshly-loaded shrinkable one. + """ + agent = _make_agent() + shrinkable = _big_png_data_url(5000) + unshrinkable = _big_png_data_url(6000) + small = "data:image/jpeg;base64," + "C" * 500 + + # _resize_image_for_vision returns small for the shrinkable input but + # echoes the oversized payload back for the unshrinkable one. + def fake_resize(path, *a, **kw): + # The temp file written by the helper contains the decoded bytes; + # distinguish by size — the 6000 KB source stays "big". 
+ try: + size = path.stat().st_size + except Exception: + size = 0 + if size > 5500 * 1024: + return unshrinkable # can't reduce — echo oversized back + return small + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + fake_resize, + raising=False, + ) + + msgs = [{ + "role": "tool", + "content": [ + {"type": "image_url", "image_url": {"url": shrinkable}}, + {"type": "image_url", "image_url": {"url": unshrinkable}}, + ], + }] + # One part shrank, one survived oversized → must NOT retry. + assert agent._try_shrink_image_parts_in_messages(msgs) is False + # The shrinkable one was still re-encoded (mutated in place). + assert msgs[0]["content"][0]["image_url"]["url"] == small + # The unshrinkable one is left as-is (caller surfaces original error). + assert msgs[0]["content"][1]["image_url"]["url"] == unshrinkable diff --git a/tests/tools/test_vision_native_fast_path.py b/tests/tools/test_vision_native_fast_path.py index 9916ca369..bb396c05d 100644 --- a/tests/tools/test_vision_native_fast_path.py +++ b/tests/tools/test_vision_native_fast_path.py @@ -139,6 +139,44 @@ class TestVisionAnalyzeNative: assert isinstance(result, dict) assert result.get("_multimodal") is True + def test_oversized_image_resized_under_embed_cap(self, tmp_path): + """Regression for the wedged-session incident (May 2026). + + A vision tool-result image is baked into conversation history and + re-sent on every subsequent turn. Anthropic rejects any single + base64 image over 5 MB with a 400, and immutable history means the + bad bytes can't be cleared by retrying — the session is permanently + wedged. The native fast path must proactively resize down to the + embed cap (well under 5 MB) BEFORE embedding, not just at the 20 MB + hard ceiling. Skips if Pillow isn't available (resize is a no-op). 
+ """ + pytest = __import__("pytest") + try: + from PIL import Image + except ImportError: + pytest.skip("Pillow not installed — proactive resize is a no-op") + + from tools.vision_tools import _EMBED_TARGET_BYTES + + # Noisy PNG that base64-encodes to well over 5 MB (won't compress much). + big = tmp_path / "big.png" + Image.effect_noise((2600, 2600), 80).convert("RGB").save(big, format="PNG") + assert big.stat().st_size * 4 // 3 > 5 * 1024 * 1024, "test image not big enough" + + result = asyncio.get_event_loop().run_until_complete( + _vision_analyze_native(str(big), "describe") + ) + assert isinstance(result, dict) and result.get("_multimodal") is True + url = next( + p["image_url"]["url"] + for p in result["content"] + if p.get("type") == "image_url" + ) + assert len(url) <= _EMBED_TARGET_BYTES, ( + f"embedded image {len(url) / 1024 / 1024:.1f} MB exceeds embed cap " + f"{_EMBED_TARGET_BYTES / 1024 / 1024:.0f} MB — would wedge sessions on Anthropic" + ) + # ─── _handle_vision_analyze fast-path gating ───────────────────────────────── diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 23a0508fe..39a4921f1 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -311,10 +311,21 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None) return data_url -# Hard limit for vision API payloads (20 MB) — matches the most restrictive -# major provider (Gemini inline data limit). Images above this are rejected. +# Absolute hard ceiling for vision API payloads (20 MB) — above this, no major +# provider accepts the image and we reject outright. _MAX_BASE64_BYTES = 20 * 1024 * 1024 +# Proactive embed cap (4 MB). This is the size we resize an image DOWN to +# before embedding it into conversation history, regardless of the 20 MB hard +# ceiling. Anthropic's per-image base64 limit is 5 MB; once an oversized image +# is baked into history (e.g. 
a vision tool-result), it is re-sent on every +# subsequent turn and permanently wedges the session with a 400 that retries +# can't clear (the bad bytes are immutable history). Capping at embed time — +# with headroom under 5 MB — is the only durable fix. Matches the post-failure +# shrink target in agent.conversation_compression so behaviour is consistent +# whether we resize proactively or reactively. +_EMBED_TARGET_BYTES = 4 * 1024 * 1024 + # Target size when auto-resizing on API failure (5 MB). After a provider # rejects an image, we downscale to this target and retry once. _RESIZE_TARGET_BYTES = 5 * 1024 * 1024 @@ -656,11 +667,21 @@ async def _vision_analyze_native( temp_image_path, mime_type=detected_mime_type, ) - # Honour the same hard cap as the legacy path. Resize if needed. - if len(image_data_url) > _MAX_BASE64_BYTES: + # Proactive embed cap: this image gets baked into conversation + # history and re-sent on every subsequent turn. Anthropic rejects + # any single base64 image over 5 MB with a 400, and because history + # is immutable, an oversized embed permanently wedges the session — + # retries can't clear bytes that are already in the request. Resize + # DOWN to the embed target (4 MB, headroom under 5 MB) whenever the + # payload exceeds it, not just at the 20 MB hard ceiling. + if len(image_data_url) > _EMBED_TARGET_BYTES: image_data_url = _resize_image_for_vision( temp_image_path, mime_type=detected_mime_type, + max_base64_bytes=_EMBED_TARGET_BYTES, ) + # If even resizing can't get under the absolute hard ceiling, reject. + # NOTE: a result still above the embed target but under that ceiling + # is not rejected by this check. if len(image_data_url) > _MAX_BASE64_BYTES: return tool_error( f"Image too large for vision API: base64 payload is "