fix(vision): guard image pixel dimensions, not just bytes (#37677)

Anthropic enforces two independent ceilings per image:

1. 5 MB encoded byte size
2. 8000 px longest side

Hermes only guarded the first (byte size). A tall screenshot (e.g. 1200x12000 at 0.06 MB) passes every byte check but fails the pixel check, returning a non-retryable HTTP 400 that permanently bricks the conversation thread.

Fixes:

- error_classifier: add 'image dimensions exceed' pattern to _IMAGE_TOO_LARGE_PATTERNS so the 400 is classified as image_too_large and triggers the shrink/retry path instead of falling through to a non-retryable error.
- conversation_compression: check pixel dimensions (via Pillow) even when byte size is under the 4 MB target. If max(dims) > 8000, force shrink.
- vision_tools._resize_image_for_vision: add optional max_dimension param. When set, images exceeding the pixel cap are downscaled even if they're under the byte budget. The resize loop now checks both byte AND pixel limits before accepting a candidate.

Closes #37677
2026-06-02 23:45:22 +00:00
parent f7dabd3019
commit 6bdbe30763
4 changed files with 65 additions and 10 deletions
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@ -646,6 +646,11 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
    # much larger; shrinking to 4 MB here loses quality but only fires
    # after a confirmed provider rejection, so the alternative is failure.
    target_bytes = 4 * 1024 * 1024
+    # Anthropic enforces an 8000px per-side dimension cap independently of
+    # the 5 MB byte cap.  A tall screenshot can be well under 5 MB yet far
+    # over 8000px (e.g. 1200×12000 at 0.06 MB).  We check pixel dimensions
+    # even when the byte budget is fine.
+    max_dimension = 8000
    changed_count = 0
    # Track parts that are over the target but could NOT be shrunk under it.
    # If any survive, retrying is pointless — the same oversized payload will
@ -658,9 +663,30 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
        """Return a smaller data URL, or None if shrink can't help."""
        if not isinstance(url, str) or not url.startswith("data:"):
            return None
-        if len(url) <= target_bytes:
-            # This specific image wasn't the oversized one.
+
+        # Check both byte size AND pixel dimensions.
+        needs_shrink = len(url) > target_bytes  # over byte budget
+        if not needs_shrink:
+            # Even if bytes are fine, check pixel dimensions against
+            # Anthropic's 8000px cap.  A tall image can be tiny in bytes
+            # yet huge in pixels.
+            try:
+                import base64 as _b64_dim
+                header_d, _, data_d = url.partition(",")
+                if not data_d:
                    return None
+                raw_d = _b64_dim.b64decode(data_d)
+                from PIL import Image as _PILImage
+                import io as _io_dim
+                with _PILImage.open(_io_dim.BytesIO(raw_d)) as _img:
+                    if max(_img.size) <= max_dimension:
+                        return None  # both bytes and pixels are fine
+                needs_shrink = True  # pixels exceed limit, force shrink
+            except Exception:
+                # If we can't check dimensions (Pillow unavailable, corrupt
+                # image, etc.), fall back to byte-only check.
+                return None
+
        try:
            header, _, data = url.partition(",")
            mime = "image/jpeg"
@ -684,6 +710,7 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
                    Path(tmp.name),
                    mime_type=mime,
                    max_base64_bytes=target_bytes,
+                    max_dimension=max_dimension,
                )
            finally:
                try:
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@ -171,6 +171,7 @@ _IMAGE_TOO_LARGE_PATTERNS = [
    "image too large",      # generic
    "image_too_large",      # error_code variant
    "image size exceeds",   # variant
+    "image dimensions exceed",  # Anthropic: "image dimensions exceed max allowed size: 8000 pixels"
    # "request_too_large" on a request known to contain an image → image is
    # the likely culprit; we still try the shrink path before giving up.
 ]
--- a/tests/run_agent/test_image_shrink_recovery.py
+++ b/tests/run_agent/test_image_shrink_recovery.py
@ -143,7 +143,7 @@ class TestShrinkImagePartsHelper:
        oversized_url = _big_png_data_url(5000)  # ~5 MB raw → ~6.7 MB b64
        shrunk = "data:image/jpeg;base64," + "A" * 1000  # small

-        def _fake_resize(path, mime_type=None, max_base64_bytes=None):
+        def _fake_resize(path, mime_type=None, max_base64_bytes=None, max_dimension=None):
            return shrunk

        monkeypatch.setattr(
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@ -342,20 +342,40 @@ def _is_image_size_error(error: Exception) -> bool:


 def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
-                              max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
+                              max_base64_bytes: int = _RESIZE_TARGET_BYTES,
+                              max_dimension: Optional[int] = None) -> str:
    """Convert an image to a base64 data URL, auto-resizing if too large.

    Tries Pillow first to progressively downscale oversized images.  If Pillow
    is not installed or resizing still exceeds the limit, falls back to the raw
    bytes and lets the caller handle the size check.

+    Args:
+        max_dimension: If set, images whose longest side exceeds this pixel
+            count are forcibly downscaled even if they're under the byte
+            budget.  Anthropic enforces an 8000 px per-side cap independently
+            of the 5 MB byte cap.
+
    Returns the base64 data URL string.
    """
    # Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
    # Skip the expensive full-read + encode if Pillow can resize directly.
    file_size = image_path.stat().st_size
    estimated_b64 = (file_size * 4) // 3 + 100  # ~header overhead
-    if estimated_b64 <= max_base64_bytes:
+    needs_resize_for_bytes = estimated_b64 > max_base64_bytes
+
+    # Check pixel dimensions even if bytes are fine.
+    needs_resize_for_dims = False
+    if max_dimension is not None:
+        try:
+            from PIL import Image as _PILQuick
+            with _PILQuick.open(image_path) as _quick_img:
+                if max(_quick_img.size) > max_dimension:
+                    needs_resize_for_dims = True
+        except Exception:
+            pass  # can't read dimensions; the byte-size estimate alone decides whether to resize
+
+    if not needs_resize_for_bytes and not needs_resize_for_dims:
        # Small enough — just encode directly.
        data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        if len(data_url) <= max_base64_bytes:
@ -373,9 +393,9 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # caller will raise the size error

-    logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
+    logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB, max_dimension=%s), auto-resizing...",
                file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
-                max_base64_bytes / (1024 * 1024))
+                max_base64_bytes / (1024 * 1024), max_dimension)

    mime = mime_type or _determine_mime_type(image_path)
    # Choose output format: JPEG for photos (smaller), PNG for transparency
@ -393,13 +413,20 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
    if pil_format == "JPEG" and img.mode in {"RGBA", "P"}:
        img = img.convert("RGB")

-    # Strategy: halve dimensions until base64 fits, up to 4 rounds.
+    # Strategy: halve dimensions until both base64 fits AND pixel dimensions
+    # are within limits, up to 4 rounds.
    # For JPEG, also try reducing quality at each size step.
    # For PNG, quality is irrelevant — only dimension reduction helps.
    quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
    prev_dims = (img.width, img.height)
    candidate = None  # will be set on first loop iteration

+    def _dims_ok(w: int, h: int) -> bool:
+        """True if both pixel dimensions are within the limit."""
+        if max_dimension is None:
+            return True
+        return max(w, h) <= max_dimension
+
    for attempt in range(5):
        if attempt > 0:
            # Proportional scaling: halve the longer side and scale the
@ -430,7 +457,7 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
            img.save(buf, **save_kwargs)
            encoded = base64.b64encode(buf.getvalue()).decode("ascii")
            candidate = f"data:{out_mime};base64,{encoded}"
-            if len(candidate) <= max_base64_bytes:
+            if len(candidate) <= max_base64_bytes and _dims_ok(img.width, img.height):
                logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
                            len(candidate) / (1024 * 1024), q,
                            img.width, img.height)