From f8b8dffccf48b1abfad68bf4fb1521a37ff1a53d Mon Sep 17 00:00:00 2001
From: tillfalko <tillfalko@gmail.com>
Date: Thu, 21 May 2026 17:31:35 +0200
Subject: [PATCH] fix(browser): add native image support to browser_vision and
 respect supports_vision

---
 tools/browser_tool.py | 50 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/tools/browser_tool.py b/tools/browser_tool.py
index 926e6c3e5..4790dc701 100644
--- a/tools/browser_tool.py
+++ b/tools/browser_tool.py
@@ -3187,6 +3187,56 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
         _screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
         data_url = f"data:image/png;base64,{_screenshot_b64}"
 
+        # Fast path: when image routing selects native vision AND either the
+        # provider accepts image content inside tool results or supports_vision
+        # resolves to True (the user override), skip the auxiliary LLM and
+        # return the screenshot as a multimodal tool-result envelope. The main
+        # model then sees the pixels directly on its next turn — no aux call,
+        # no information loss, no extra latency.
+        try:
+            from agent.auxiliary_client import _read_main_model, _read_main_provider
+            from agent.image_routing import decide_image_input_mode, _lookup_supports_vision
+            from hermes_cli.config import load_config
+            from tools.vision_tools import (
+                _build_native_vision_tool_result,
+                _supports_media_in_tool_results,
+            )
+
+            _provider = _read_main_provider()
+            _model = _read_main_model()
+            _cfg = load_config()
+            _mode = decide_image_input_mode(_provider, _model, _cfg)
+            _supports_vision = _lookup_supports_vision(_provider, _model, _cfg) is True
+            if _mode == "native" and (
+                _supports_media_in_tool_results(_provider, _model)
+                or _supports_vision
+            ):
+                native_result = _build_native_vision_tool_result(
+                    image_url=str(screenshot_path),
+                    question=question,
+                    image_data_url=data_url,
+                    image_size_bytes=len(_screenshot_bytes),
+                )
+                native_result.setdefault("meta", {})
+                native_result["meta"]["screenshot_path"] = str(screenshot_path)
+                if _lp_fallback_warning:
+                    native_result["meta"]["fallback_warning"] = _lp_fallback_warning
+                if annotate and result.get("data", {}).get("annotations"):
+                    native_result["meta"]["annotations"] = result["data"]["annotations"]
+                text_parts = native_result.get("content") or []
+                if text_parts and isinstance(text_parts[0], dict) and text_parts[0].get("type") == "text":
+                    text_parts[0]["text"] = (
+                        str(text_parts[0].get("text", ""))
+                        + f"\n\nScreenshot path: {screenshot_path}"
+                    )
+                native_result["text_summary"] = (
+                    str(native_result.get("text_summary") or "")
+                    + f" Screenshot path: {screenshot_path}"
+                ).strip()
+                return native_result
+        except Exception:
+            pass
+
         vision_prompt = (
             f"You are analyzing a screenshot of a web browser.\n\n"
             f"User's question: {question}\n\n"