docs(browser): update browser_vision tool description for native vision routing

2026-05-21 21:07:32 +02:00
parent 2402ec5e7b
commit c3f28c651d
1 changed file with 12 additions and 9 deletions
--- a/tools/browser_tool.py
+++ b/tools/browser_tool.py
@@ -1578,7 +1578,7 @@ BROWSER_TOOL_SCHEMAS = [
    },
    {
        "name": "browser_vision",
-        "description": "Take a screenshot of the current page and analyze it with vision AI. Use this when you need to visually understand what's on the page - especially useful for CAPTCHAs, visual verification challenges, complex layouts, or when the text snapshot doesn't capture important visual information. Returns both the AI analysis and a screenshot_path that you can share with the user by including MEDIA:<screenshot_path> in your response. Requires browser_navigate to be called first.",
+        "description": "Take a screenshot of the current page so you can inspect it visually. Use this when you need to understand what the page looks like - especially for CAPTCHAs, visual verification challenges, complex layouts, or cases where the text snapshot misses important visual information. When your active model has native vision, the screenshot is attached to your context directly and you inspect it on the next turn; otherwise Hermes falls back to an auxiliary vision model and returns a text analysis. Includes a screenshot_path that you can share with the user by including MEDIA:<screenshot_path> in your response. Requires browser_navigate to be called first.",
        "parameters": {
            "type": "object",
            "properties": {
@@ -3046,15 +3046,17 @@ def browser_get_images(task_id: Optional[str] = None) -> str:

 def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str:
    """
-    Take a screenshot of the current page and analyze it with vision AI.
+    Take a screenshot of the current page for visual inspection.

-    This tool captures what's visually displayed in the browser and sends it
-    to Gemini for analysis. Useful for understanding visual content that the
-    text-based snapshot may not capture (CAPTCHAs, verification challenges,
-    images, complex layouts, etc.).
+    This tool captures what's visually displayed in the browser. When the
+    active model supports native vision, the screenshot is attached directly
+    to the conversation so the model can inspect it on the next turn.
+    Otherwise Hermes falls back to the auxiliary vision model. Useful for
+    understanding visual content that the text-based snapshot may not capture
+    (CAPTCHAs, verification challenges, images, complex layouts, etc.).

-    The screenshot is saved persistently and its file path is returned alongside
-    the analysis, so it can be shared with users via MEDIA:<path> in the response.
+    The screenshot is saved persistently and its file path is returned so it
+    can be shared with users via MEDIA:<path> in the response.

    Args:
        question: What you want to know about the page visually
@@ -3062,7 +3064,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
        task_id: Task identifier for session isolation

    Returns:
-        JSON string with vision analysis results and screenshot_path
+        Either a JSON string with vision analysis results and screenshot_path,
+        or a multimodal tool-result envelope with the screenshot and metadata.
    """
    if _is_camofox_mode():
        from tools.browser_camofox import camofox_vision