From f8b8dffccf48b1abfad68bf4fb1521a37ff1a53d Mon Sep 17 00:00:00 2001 From: tillfalko Date: Thu, 21 May 2026 17:31:35 +0200 Subject: [PATCH] fix(browser): add native image support to browser_vision and respect supports_vision --- tools/browser_tool.py | 50 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tools/browser_tool.py b/tools/browser_tool.py index 926e6c3e5..4790dc701 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -3187,6 +3187,56 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] _screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii") data_url = f"data:image/png;base64,{_screenshot_b64}" + # Fast path: when the active main model supports native vision AND the + # provider supports image content inside tool results, short-circuit + # the auxiliary LLM and return the image bytes as a multimodal + # tool-result envelope. The user can force native vision with the + # supports_vision override. The main model sees the pixels directly on its + # next turn — no aux call, no information loss, no extra latency. + try: + from agent.auxiliary_client import _read_main_model, _read_main_provider + from agent.image_routing import decide_image_input_mode, _lookup_supports_vision + from hermes_cli.config import load_config + from tools.vision_tools import ( + _build_native_vision_tool_result, + _supports_media_in_tool_results, + ) + + _provider = _read_main_provider() + _model = _read_main_model() + _cfg = load_config() + _mode = decide_image_input_mode(_provider, _model, _cfg) + _supports_vision = _lookup_supports_vision(_provider, _model, _cfg) is True + if _mode == "native" and ( + _supports_media_in_tool_results(_provider, _model) + or _supports_vision + ): + native_result = _build_native_vision_tool_result( + image_url=str(screenshot_path), + question=question, + image_data_url=data_url, + image_size_bytes=len(_screenshot_bytes), + ) + native_result.setdefault("meta", {}) + native_result["meta"]["screenshot_path"] = str(screenshot_path) + if _lp_fallback_warning: + native_result["meta"]["fallback_warning"] = _lp_fallback_warning + if annotate and result.get("data", {}).get("annotations"): + native_result["meta"]["annotations"] = result["data"]["annotations"] + text_parts = native_result.get("content") or [] + if text_parts and isinstance(text_parts[0], dict) and text_parts[0].get("type") == "text": + text_parts[0]["text"] = ( + str(text_parts[0].get("text", "")) + + f"\n\nScreenshot path: {screenshot_path}" + ) + native_result["text_summary"] = ( + str(native_result.get("text_summary") or "") + + f" Screenshot path: {screenshot_path}" + ).strip() + return native_result + except Exception: + pass + vision_prompt = ( f"You are analyzing a screenshot of a web browser.\n\n" f"User's question: {question}\n\n"