docs(browser): update browser_vision tool description for native vision routing

This commit is contained in:
tillfalko
2026-05-21 21:07:32 +02:00
committed by Teknium
parent 2402ec5e7b
commit c3f28c651d

View File

@ -1578,7 +1578,7 @@ BROWSER_TOOL_SCHEMAS = [
},
{
"name": "browser_vision",
"description": "Take a screenshot of the current page and analyze it with vision AI. Use this when you need to visually understand what's on the page - especially useful for CAPTCHAs, visual verification challenges, complex layouts, or when the text snapshot doesn't capture important visual information. Returns both the AI analysis and a screenshot_path that you can share with the user by including MEDIA:<screenshot_path> in your response. Requires browser_navigate to be called first.",
"description": "Take a screenshot of the current page so you can inspect it visually. Use this when you need to understand what the page looks like - especially for CAPTCHAs, visual verification challenges, complex layouts, or cases where the text snapshot misses important visual information. When your active model has native vision, the screenshot is attached to your context directly and you inspect it on the next turn; otherwise Hermes falls back to an auxiliary vision model and returns a text analysis. Includes a screenshot_path that you can share with the user by including MEDIA:<screenshot_path> in your response. Requires browser_navigate to be called first.",
"parameters": {
"type": "object",
"properties": {
@ -3046,15 +3046,17 @@ def browser_get_images(task_id: Optional[str] = None) -> str:
def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str:
"""
Take a screenshot of the current page and analyze it with vision AI.
Take a screenshot of the current page for visual inspection.
This tool captures what's visually displayed in the browser and sends it
to Gemini for analysis. Useful for understanding visual content that the
text-based snapshot may not capture (CAPTCHAs, verification challenges,
images, complex layouts, etc.).
This tool captures what's visually displayed in the browser. When the
active model supports native vision, the screenshot is attached directly
to the conversation so the model can inspect it on the next turn.
Otherwise Hermes falls back to the auxiliary vision model. Useful for
understanding visual content that the text-based snapshot may not capture
(CAPTCHAs, verification challenges, images, complex layouts, etc.).
The screenshot is saved persistently and its file path is returned alongside
the analysis, so it can be shared with users via MEDIA:<path> in the response.
The screenshot is saved persistently and its file path is returned so it
can be shared with users via MEDIA:<path> in the response.
Args:
question: What you want to know about the page visually
@ -3062,7 +3064,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
task_id: Task identifier for session isolation
Returns:
JSON string with vision analysis results and screenshot_path
Either a JSON string with vision analysis results and screenshot_path,
or a multimodal tool-result envelope with the screenshot and metadata.
"""
if _is_camofox_mode():
from tools.browser_camofox import camofox_vision