fix(vision): guard image pixel dimensions, not just bytes (#37677)

Anthropic enforces two independent ceilings per image:
1. 5 MB encoded byte size
2. 8000 px longest side

Hermes only guarded #1. A tall screenshot (e.g. 1200x12000 at 0.06 MB)
passes every byte check but fails the pixel check, returning a
non-retryable HTTP 400 that permanently bricks the conversation thread.

Fixes:
- error_classifier: add 'image dimensions exceed' pattern to
  _IMAGE_TOO_LARGE_PATTERNS so the 400 is classified as image_too_large
  and triggers the shrink/retry path instead of falling through to
  non-retryable error.
- conversation_compression: check pixel dimensions (via Pillow) even
  when byte size is under the 4 MB target. If max(dims) > 8000, force
  shrink.
- vision_tools._resize_image_for_vision: add optional max_dimension param.
  When set, images exceeding the pixel cap are downscaled even if they're
  under the byte budget. The resize loop now checks both byte AND pixel
  limits before accepting a candidate.

Closes #37677
This commit is contained in:
kyssta-exe
2026-06-02 23:45:22 +00:00
committed by Teknium
parent f7dabd3019
commit 6bdbe30763
4 changed files with 65 additions and 10 deletions

View File

@ -646,6 +646,11 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
# much larger; shrinking to 4 MB here loses quality but only fires
# after a confirmed provider rejection, so the alternative is failure.
target_bytes = 4 * 1024 * 1024
# Anthropic enforces an 8000px per-side dimension cap independently of
# the 5 MB byte cap. A tall screenshot can be well under 5 MB yet far
# over 8000px (e.g. 1200×12000 at 0.06 MB). We check pixel dimensions
# even when the byte budget is fine.
max_dimension = 8000
changed_count = 0
# Track parts that are over the target but could NOT be shrunk under it.
# If any survive, retrying is pointless — the same oversized payload will
@ -658,9 +663,30 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
"""Return a smaller data URL, or None if shrink can't help."""
if not isinstance(url, str) or not url.startswith("data:"):
return None
if len(url) <= target_bytes:
# This specific image wasn't the oversized one.
# Check both byte size AND pixel dimensions.
needs_shrink = len(url) > target_bytes # over byte budget
if not needs_shrink:
# Even if bytes are fine, check pixel dimensions against
# Anthropic's 8000px cap. A tall image can be tiny in bytes
# yet huge in pixels.
try:
import base64 as _b64_dim
header_d, _, data_d = url.partition(",")
if not data_d:
return None
raw_d = _b64_dim.b64decode(data_d)
from PIL import Image as _PILImage
import io as _io_dim
with _PILImage.open(_io_dim.BytesIO(raw_d)) as _img:
if max(_img.size) <= max_dimension:
return None # both bytes and pixels are fine
needs_shrink = True # pixels exceed limit, force shrink
except Exception:
# If we can't check dimensions (Pillow unavailable, corrupt
# image, etc.), fall back to byte-only check.
return None
try:
header, _, data = url.partition(",")
mime = "image/jpeg"
@ -684,6 +710,7 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
Path(tmp.name),
mime_type=mime,
max_base64_bytes=target_bytes,
max_dimension=max_dimension,
)
finally:
try:

View File

@ -171,6 +171,7 @@ _IMAGE_TOO_LARGE_PATTERNS = [
"image too large", # generic
"image_too_large", # error_code variant
"image size exceeds", # variant
"image dimensions exceed", # Anthropic: "image dimensions exceed max allowed size: 8000 pixels"
# "request_too_large" on a request known to contain an image → image is
# the likely culprit; we still try the shrink path before giving up.
]

View File

@ -143,7 +143,7 @@ class TestShrinkImagePartsHelper:
oversized_url = _big_png_data_url(5000) # ~5 MB raw → ~6.7 MB b64
shrunk = "data:image/jpeg;base64," + "A" * 1000 # small
def _fake_resize(path, mime_type=None, max_base64_bytes=None):
def _fake_resize(path, mime_type=None, max_base64_bytes=None, max_dimension=None):
return shrunk
monkeypatch.setattr(

View File

@ -342,20 +342,40 @@ def _is_image_size_error(error: Exception) -> bool:
def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
max_base64_bytes: int = _RESIZE_TARGET_BYTES,
max_dimension: Optional[int] = None) -> str:
"""Convert an image to a base64 data URL, auto-resizing if too large.
Tries Pillow first to progressively downscale oversized images. If Pillow
is not installed or resizing still exceeds the limit, falls back to the raw
bytes and lets the caller handle the size check.
Args:
max_dimension: If set, images whose longest side exceeds this pixel
count are forcibly downscaled even if they're under the byte
budget. Anthropic enforces an 8000 px per-side cap independently
of the 5 MB byte cap.
Returns the base64 data URL string.
"""
# Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
# Skip the expensive full-read + encode if Pillow can resize directly.
file_size = image_path.stat().st_size
estimated_b64 = (file_size * 4) // 3 + 100 # ~header overhead
if estimated_b64 <= max_base64_bytes:
needs_resize_for_bytes = estimated_b64 > max_base64_bytes
# Check pixel dimensions even if bytes are fine.
needs_resize_for_dims = False
if max_dimension is not None:
try:
from PIL import Image as _PILQuick
with _PILQuick.open(image_path) as _quick_img:
if max(_quick_img.size) > max_dimension:
needs_resize_for_dims = True
except Exception:
pass # can't check; Pillow path below will handle or skip
if not needs_resize_for_bytes and not needs_resize_for_dims:
# Small enough — just encode directly.
data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
if len(data_url) <= max_base64_bytes:
@ -373,9 +393,9 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
return data_url # caller will raise the size error
logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB, max_dimension=%s), auto-resizing...",
file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
max_base64_bytes / (1024 * 1024))
max_base64_bytes / (1024 * 1024), max_dimension)
mime = mime_type or _determine_mime_type(image_path)
# Choose output format: JPEG for photos (smaller), PNG for transparency
@ -393,13 +413,20 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
if pil_format == "JPEG" and img.mode in {"RGBA", "P"}:
img = img.convert("RGB")
# Strategy: halve dimensions until base64 fits, up to 4 rounds.
# Strategy: halve dimensions until both base64 fits AND pixel dimensions
# are within limits, up to 4 rounds.
# For JPEG, also try reducing quality at each size step.
# For PNG, quality is irrelevant — only dimension reduction helps.
quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
prev_dims = (img.width, img.height)
candidate = None # will be set on first loop iteration
def _dims_ok(w: int, h: int) -> bool:
"""True if both pixel dimensions are within the limit."""
if max_dimension is None:
return True
return max(w, h) <= max_dimension
for attempt in range(5):
if attempt > 0:
# Proportional scaling: halve the longer side and scale the
@ -430,7 +457,7 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
img.save(buf, **save_kwargs)
encoded = base64.b64encode(buf.getvalue()).decode("ascii")
candidate = f"data:{out_mime};base64,{encoded}"
if len(candidate) <= max_base64_bytes:
if len(candidate) <= max_base64_bytes and _dims_ok(img.width, img.height):
logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
len(candidate) / (1024 * 1024), q,
img.width, img.height)