From d33d23c8526c543ca38ca704f76171c3cec44c3f Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Thu, 4 Jun 2026 17:31:51 -0700 Subject: [PATCH] fix(vision): drop models.dev catalog fallback, keep explicit profile flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The models.dev supports_vision field reflects model IMAGE-INPUT capability, which is not the same contract as 'provider API accepts images inside tool-result messages' — the looser heuristic could re-introduce the exact HTTP 400 'text is not set' it aims to fix. Keep only the explicit, opt-in ProviderProfile.supports_vision flag (set on xiaomi); add catalog-based detection later if a concrete provider needs it. --- tools/vision_tools.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 253856b9b..0def28142 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -541,8 +541,7 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool: For unknown / legacy providers we conservatively return False — the caller falls back to the legacy aux-LLM text path. The check is relaxed - when the provider's ``ProviderProfile`` declares ``supports_vision=True`` - or when ``get_model_capabilities`` reports vision support for the model. + when the provider's ``ProviderProfile`` declares ``supports_vision=True``. """ if not isinstance(provider, str): return False @@ -590,16 +589,6 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool: except Exception: pass - # Check model capabilities from the models.dev catalog as a final - # fallback for custom providers whose models happen to be registered. - try: - from agent.models_dev import get_model_capabilities - caps = get_model_capabilities(provider, model) - if caps is not None and bool(getattr(caps, "supports_vision", False)): - return True - except Exception: - pass - # Other vision-capable provider stacks. Conservative default: False. # Add explicit entries here as we verify each provider's tool-result # multimodal support empirically.