From f736d2be86b8a76d2ced3d41ced8b172c24a1eeb Mon Sep 17 00:00:00 2001 From: Kewe63 Date: Fri, 15 May 2026 16:56:05 +0300 Subject: [PATCH] fix(vision): detect vision-capable custom providers via ProviderProfile flag _supports_media_in_tool_results() had a hardcoded provider allowlist that missed custom providers and newer vision-capable providers like xiaomi. Added ProviderProfile.supports_vision flag and made the function check: 1. Registered provider profile (supports_vision flag) 2. Model capabilities from models.dev catalog (supports_vision) 3. Existing hardcoded allowlist (unchanged) This fixes HTTP 400 "text is not set" errors when vision-capable custom providers receive text-only tool results instead of multipart image content. Related: #25594 --- plugins/model-providers/xiaomi/__init__.py | 1 + providers/base.py | 9 ++++++++ tools/vision_tools.py | 25 +++++++++++++++++++++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/plugins/model-providers/xiaomi/__init__.py b/plugins/model-providers/xiaomi/__init__.py index aed0d8424..93c7dbb29 100644 --- a/plugins/model-providers/xiaomi/__init__.py +++ b/plugins/model-providers/xiaomi/__init__.py @@ -9,6 +9,7 @@ xiaomi = ProviderProfile( env_vars=("XIAOMI_API_KEY",), base_url="https://api.xiaomimimo.com/v1", supports_health_check=False, # /v1/models returns 401 even with valid key + supports_vision=True, # mimo-v2-omni is vision-capable ) register_provider(xiaomi) diff --git a/providers/base.py b/providers/base.py index 01023ff55..d7ff470d8 100644 --- a/providers/base.py +++ b/providers/base.py @@ -56,6 +56,15 @@ class ProviderProfile: auth_type: str = "api_key" # api_key|oauth_device_code|oauth_external|copilot|aws_sdk supports_health_check: bool = True # False → doctor skips /models probe for this provider + # ── Vision support ──────────────────────────────────────── + # True when the provider's API accepts image content inside + # tool-result messages natively. 
Set on providers that expose + # multimodal models via tool results (Anthropic Messages API, + # OpenAI Chat Completions, Gemini, Xiaomi, MiniMax, etc.). + # Falls back to model-catalog lookup when False or when the provider + # has no registered profile. + supports_vision: bool = False + # ── Model catalog ───────────────────────────────────────── # fallback_models: curated list shown in /model picker when live fetch fails. # Only agentic models that support tool calling should appear here. diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 10e97298a..253856b9b 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -540,7 +540,9 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool: results. Older Gemini does NOT. For unknown / legacy providers we conservatively return False — the - caller falls back to the legacy aux-LLM text path. + caller falls back to the legacy aux-LLM text path. The check is relaxed + when the provider's ``ProviderProfile`` declares ``supports_vision=True`` + or when ``get_model_capabilities`` reports vision support for the model. """ if not isinstance(provider, str): return False @@ -577,6 +579,27 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool: return True return False + # Check the provider's registered profile for the supports_vision flag. + # This covers vision-capable providers like xiaomi, minimax, etc. that + # aren't in the hardcoded list above. + try: + from providers import get_provider_profile + profile = get_provider_profile(p) + if profile is not None and profile.supports_vision: + return True + except Exception: + pass + + # Check model capabilities from the models.dev catalog as a final + # fallback for custom providers whose models happen to be registered. 
+ try: + from agent.models_dev import get_model_capabilities + caps = get_model_capabilities(provider, model) + if caps is not None and bool(getattr(caps, "supports_vision", False)): + return True + except Exception: + pass + # Other vision-capable provider stacks. Conservative default: False. # Add explicit entries here as we verify each provider's tool-result # multimodal support empirically.