fix(vision): guard image pixel dimensions, not just bytes (#37677)
Anthropic enforces two independent ceilings per image: (1) a 5 MB encoded byte size and (2) 8000 px on the longest side. Hermes only guarded the first. A tall screenshot (e.g. 1200x12000 at 0.06 MB) passes every byte check but fails the pixel check, and the provider returns a non-retryable HTTP 400 that permanently bricks the conversation thread. Fixes: - error_classifier: add the 'image dimensions exceed' pattern to _IMAGE_TOO_LARGE_PATTERNS so the 400 is classified as image_too_large and triggers the shrink/retry path instead of falling through to a non-retryable error. - conversation_compression: check pixel dimensions (via Pillow) even when the byte size is under the 4 MB target. If max(dims) > 8000, force a shrink. - vision_tools._resize_image_for_vision: add an optional max_dimension param. When set, images exceeding the pixel cap are downscaled even if they're under the byte budget. The resize loop now checks both the byte AND pixel limits before accepting a candidate. Closes #37677
This commit is contained in:
@ -646,6 +646,11 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
|
||||
# much larger; shrinking to 4 MB here loses quality but only fires
|
||||
# after a confirmed provider rejection, so the alternative is failure.
|
||||
target_bytes = 4 * 1024 * 1024
|
||||
# Anthropic enforces an 8000px per-side dimension cap independently of
|
||||
# the 5 MB byte cap. A tall screenshot can be well under 5 MB yet far
|
||||
# over 8000px (e.g. 1200×12000 at 0.06 MB). We check pixel dimensions
|
||||
# even when the byte budget is fine.
|
||||
max_dimension = 8000
|
||||
changed_count = 0
|
||||
# Track parts that are over the target but could NOT be shrunk under it.
|
||||
# If any survive, retrying is pointless — the same oversized payload will
|
||||
@ -658,9 +663,30 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
|
||||
"""Return a smaller data URL, or None if shrink can't help."""
|
||||
if not isinstance(url, str) or not url.startswith("data:"):
|
||||
return None
|
||||
if len(url) <= target_bytes:
|
||||
# This specific image wasn't the oversized one.
|
||||
return None
|
||||
|
||||
# Check both byte size AND pixel dimensions.
|
||||
needs_shrink = len(url) > target_bytes # over byte budget
|
||||
if not needs_shrink:
|
||||
# Even if bytes are fine, check pixel dimensions against
|
||||
# Anthropic's 8000px cap. A tall image can be tiny in bytes
|
||||
# yet huge in pixels.
|
||||
try:
|
||||
import base64 as _b64_dim
|
||||
header_d, _, data_d = url.partition(",")
|
||||
if not data_d:
|
||||
return None
|
||||
raw_d = _b64_dim.b64decode(data_d)
|
||||
from PIL import Image as _PILImage
|
||||
import io as _io_dim
|
||||
with _PILImage.open(_io_dim.BytesIO(raw_d)) as _img:
|
||||
if max(_img.size) <= max_dimension:
|
||||
return None # both bytes and pixels are fine
|
||||
needs_shrink = True # pixels exceed limit, force shrink
|
||||
except Exception:
|
||||
# If we can't check dimensions (Pillow unavailable, corrupt
|
||||
# image, etc.), fall back to byte-only check.
|
||||
return None
|
||||
|
||||
try:
|
||||
header, _, data = url.partition(",")
|
||||
mime = "image/jpeg"
|
||||
@ -684,6 +710,7 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
|
||||
Path(tmp.name),
|
||||
mime_type=mime,
|
||||
max_base64_bytes=target_bytes,
|
||||
max_dimension=max_dimension,
|
||||
)
|
||||
finally:
|
||||
try:
|
||||
|
||||
@ -171,6 +171,7 @@ _IMAGE_TOO_LARGE_PATTERNS = [
|
||||
"image too large", # generic
|
||||
"image_too_large", # error_code variant
|
||||
"image size exceeds", # variant
|
||||
"image dimensions exceed", # Anthropic: "image dimensions exceed max allowed size: 8000 pixels"
|
||||
# "request_too_large" on a request known to contain an image → image is
|
||||
# the likely culprit; we still try the shrink path before giving up.
|
||||
]
|
||||
|
||||
@ -143,7 +143,7 @@ class TestShrinkImagePartsHelper:
|
||||
oversized_url = _big_png_data_url(5000) # ~5 MB raw → ~6.7 MB b64
|
||||
shrunk = "data:image/jpeg;base64," + "A" * 1000 # small
|
||||
|
||||
def _fake_resize(path, mime_type=None, max_base64_bytes=None):
|
||||
def _fake_resize(path, mime_type=None, max_base64_bytes=None, max_dimension=None):
|
||||
return shrunk
|
||||
|
||||
monkeypatch.setattr(
|
||||
|
||||
@ -342,20 +342,40 @@ def _is_image_size_error(error: Exception) -> bool:
|
||||
|
||||
|
||||
def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
|
||||
max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
|
||||
max_base64_bytes: int = _RESIZE_TARGET_BYTES,
|
||||
max_dimension: Optional[int] = None) -> str:
|
||||
"""Convert an image to a base64 data URL, auto-resizing if too large.
|
||||
|
||||
Tries Pillow first to progressively downscale oversized images. If Pillow
|
||||
is not installed or resizing still exceeds the limit, falls back to the raw
|
||||
bytes and lets the caller handle the size check.
|
||||
|
||||
Args:
|
||||
max_dimension: If set, images whose longest side exceeds this pixel
|
||||
count are forcibly downscaled even if they're under the byte
|
||||
budget. Anthropic enforces an 8000 px per-side cap independently
|
||||
of the 5 MB byte cap.
|
||||
|
||||
Returns the base64 data URL string.
|
||||
"""
|
||||
# Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
|
||||
# Skip the expensive full-read + encode if Pillow can resize directly.
|
||||
file_size = image_path.stat().st_size
|
||||
estimated_b64 = (file_size * 4) // 3 + 100 # ~header overhead
|
||||
if estimated_b64 <= max_base64_bytes:
|
||||
needs_resize_for_bytes = estimated_b64 > max_base64_bytes
|
||||
|
||||
# Check pixel dimensions even if bytes are fine.
|
||||
needs_resize_for_dims = False
|
||||
if max_dimension is not None:
|
||||
try:
|
||||
from PIL import Image as _PILQuick
|
||||
with _PILQuick.open(image_path) as _quick_img:
|
||||
if max(_quick_img.size) > max_dimension:
|
||||
needs_resize_for_dims = True
|
||||
except Exception:
|
||||
pass # can't check; Pillow path below will handle or skip
|
||||
|
||||
if not needs_resize_for_bytes and not needs_resize_for_dims:
|
||||
# Small enough — just encode directly.
|
||||
data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
|
||||
if len(data_url) <= max_base64_bytes:
|
||||
@ -373,9 +393,9 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
|
||||
data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
|
||||
return data_url # caller will raise the size error
|
||||
|
||||
logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
|
||||
logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB, max_dimension=%s), auto-resizing...",
|
||||
file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
|
||||
max_base64_bytes / (1024 * 1024))
|
||||
max_base64_bytes / (1024 * 1024), max_dimension)
|
||||
|
||||
mime = mime_type or _determine_mime_type(image_path)
|
||||
# Choose output format: JPEG for photos (smaller), PNG for transparency
|
||||
@ -393,13 +413,20 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
|
||||
if pil_format == "JPEG" and img.mode in {"RGBA", "P"}:
|
||||
img = img.convert("RGB")
|
||||
|
||||
# Strategy: halve dimensions until base64 fits, up to 4 rounds.
|
||||
# Strategy: halve dimensions until both base64 fits AND pixel dimensions
|
||||
# are within limits, up to 4 rounds.
|
||||
# For JPEG, also try reducing quality at each size step.
|
||||
# For PNG, quality is irrelevant — only dimension reduction helps.
|
||||
quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
|
||||
prev_dims = (img.width, img.height)
|
||||
candidate = None # will be set on first loop iteration
|
||||
|
||||
def _dims_ok(w: int, h: int) -> bool:
|
||||
"""True if both pixel dimensions are within the limit."""
|
||||
if max_dimension is None:
|
||||
return True
|
||||
return max(w, h) <= max_dimension
|
||||
|
||||
for attempt in range(5):
|
||||
if attempt > 0:
|
||||
# Proportional scaling: halve the longer side and scale the
|
||||
@ -430,7 +457,7 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
|
||||
img.save(buf, **save_kwargs)
|
||||
encoded = base64.b64encode(buf.getvalue()).decode("ascii")
|
||||
candidate = f"data:{out_mime};base64,{encoded}"
|
||||
if len(candidate) <= max_base64_bytes:
|
||||
if len(candidate) <= max_base64_bytes and _dims_ok(img.width, img.height):
|
||||
logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
|
||||
len(candidate) / (1024 * 1024), q,
|
||||
img.width, img.height)
|
||||
|
||||
Reference in New Issue
Block a user