diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py index 64a5a6278..06257ffd2 100644 --- a/agent/conversation_compression.py +++ b/agent/conversation_compression.py @@ -646,6 +646,11 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool: # much larger; shrinking to 4 MB here loses quality but only fires # after a confirmed provider rejection, so the alternative is failure. target_bytes = 4 * 1024 * 1024 + # Anthropic enforces an 8000px per-side dimension cap independently of + # the 5 MB byte cap. A tall screenshot can be well under 5 MB yet far + # over 8000px (e.g. 1200×12000 at 0.06 MB). We check pixel dimensions + # even when the byte budget is fine. + max_dimension = 8000 changed_count = 0 # Track parts that are over the target but could NOT be shrunk under it. # If any survive, retrying is pointless — the same oversized payload will @@ -658,9 +663,30 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool: """Return a smaller data URL, or None if shrink can't help.""" if not isinstance(url, str) or not url.startswith("data:"): return None - if len(url) <= target_bytes: - # This specific image wasn't the oversized one. - return None + + # Check both byte size AND pixel dimensions. + needs_shrink = len(url) > target_bytes # over byte budget + if not needs_shrink: + # Even if bytes are fine, check pixel dimensions against + # Anthropic's 8000px cap. A tall image can be tiny in bytes + # yet huge in pixels. + try: + import base64 as _b64_dim + header_d, _, data_d = url.partition(",") + if not data_d: + return None + raw_d = _b64_dim.b64decode(data_d) + from PIL import Image as _PILImage + import io as _io_dim + with _PILImage.open(_io_dim.BytesIO(raw_d)) as _img: + if max(_img.size) <= max_dimension: + return None # both bytes and pixels are fine + needs_shrink = True # pixels exceed limit, force shrink + except Exception: + # If we can't check dimensions (Pillow unavailable, corrupt + # image, etc.), fall back to byte-only check. + return None + try: header, _, data = url.partition(",") mime = "image/jpeg" @@ -684,6 +710,7 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool: Path(tmp.name), mime_type=mime, max_base64_bytes=target_bytes, + max_dimension=max_dimension, ) finally: try: diff --git a/agent/error_classifier.py b/agent/error_classifier.py index e8a44866b..56fbc188a 100644 --- a/agent/error_classifier.py +++ b/agent/error_classifier.py @@ -171,6 +171,7 @@ _IMAGE_TOO_LARGE_PATTERNS = [ "image too large", # generic "image_too_large", # error_code variant "image size exceeds", # variant + "image dimensions exceed", # Anthropic: "image dimensions exceed max allowed size: 8000 pixels" # "request_too_large" on a request known to contain an image → image is # the likely culprit; we still try the shrink path before giving up. ] diff --git a/tests/run_agent/test_image_shrink_recovery.py b/tests/run_agent/test_image_shrink_recovery.py index 86a3e6abf..b707f1d92 100644 --- a/tests/run_agent/test_image_shrink_recovery.py +++ b/tests/run_agent/test_image_shrink_recovery.py @@ -143,7 +143,7 @@ class TestShrinkImagePartsHelper: oversized_url = _big_png_data_url(5000) # ~5 MB raw → ~6.7 MB b64 shrunk = "data:image/jpeg;base64," + "A" * 1000 # small - def _fake_resize(path, mime_type=None, max_base64_bytes=None): + def _fake_resize(path, mime_type=None, max_base64_bytes=None, max_dimension=None): return shrunk monkeypatch.setattr( diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 39a4921f1..2576752a4 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -342,20 +342,40 @@ def _is_image_size_error(error: Exception) -> bool: def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None, - max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str: + max_base64_bytes: int = _RESIZE_TARGET_BYTES, + max_dimension: Optional[int] = None) -> str: """Convert an image to a base64 data URL, auto-resizing if too large. Tries Pillow first to progressively downscale oversized images. If Pillow is not installed or resizing still exceeds the limit, falls back to the raw bytes and lets the caller handle the size check. + Args: + max_dimension: If set, images whose longest side exceeds this pixel + count are forcibly downscaled even if they're under the byte + budget. Anthropic enforces an 8000 px per-side cap independently + of the 5 MB byte cap. + Returns the base64 data URL string. """ # Quick file-size estimate: base64 expands by ~4/3, plus data URL header. # Skip the expensive full-read + encode if Pillow can resize directly. file_size = image_path.stat().st_size estimated_b64 = (file_size * 4) // 3 + 100 # ~header overhead - if estimated_b64 <= max_base64_bytes: + needs_resize_for_bytes = estimated_b64 > max_base64_bytes + + # Check pixel dimensions even if bytes are fine. + needs_resize_for_dims = False + if max_dimension is not None: + try: + from PIL import Image as _PILQuick + with _PILQuick.open(image_path) as _quick_img: + if max(_quick_img.size) > max_dimension: + needs_resize_for_dims = True + except Exception: + pass # can't check; Pillow path below will handle or skip + + if not needs_resize_for_bytes and not needs_resize_for_dims: # Small enough — just encode directly. data_url = _image_to_base64_data_url(image_path, mime_type=mime_type) if len(data_url) <= max_base64_bytes: @@ -373,9 +393,9 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None, data_url = _image_to_base64_data_url(image_path, mime_type=mime_type) return data_url # caller will raise the size error - logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...", + logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB, max_dimension=%s), auto-resizing...", file_size / (1024 * 1024), estimated_b64 / (1024 * 1024), - max_base64_bytes / (1024 * 1024)) + max_base64_bytes / (1024 * 1024), max_dimension) mime = mime_type or _determine_mime_type(image_path) # Choose output format: JPEG for photos (smaller), PNG for transparency @@ -393,13 +413,20 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None, if pil_format == "JPEG" and img.mode in {"RGBA", "P"}: img = img.convert("RGB") - # Strategy: halve dimensions until base64 fits, up to 4 rounds. + # Strategy: halve dimensions until both base64 fits AND pixel dimensions + # are within limits, up to 4 rounds. # For JPEG, also try reducing quality at each size step. # For PNG, quality is irrelevant — only dimension reduction helps. quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,) prev_dims = (img.width, img.height) candidate = None # will be set on first loop iteration + def _dims_ok(w: int, h: int) -> bool: + """True if both pixel dimensions are within the limit.""" + if max_dimension is None: + return True + return max(w, h) <= max_dimension + for attempt in range(5): if attempt > 0: # Proportional scaling: halve the longer side and scale the @@ -430,7 +457,7 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None, img.save(buf, **save_kwargs) encoded = base64.b64encode(buf.getvalue()).decode("ascii") candidate = f"data:{out_mime};base64,{encoded}" - if len(candidate) <= max_base64_bytes: + if len(candidate) <= max_base64_bytes and _dims_ok(img.width, img.height): logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)", len(candidate) / (1024 * 1024), q, img.width, img.height)