From f736d2be86b8a76d2ced3d41ced8b172c24a1eeb Mon Sep 17 00:00:00 2001
From: Kewe63 <Kewe63@users.noreply.github.com>
Date: Fri, 15 May 2026 16:56:05 +0300
Subject: [PATCH] fix(vision): detect vision-capable custom providers via
 ProviderProfile flag

_supports_media_in_tool_results() had a hardcoded provider allowlist
that missed custom providers and newer vision-capable providers like
xiaomi. Added ProviderProfile.supports_vision flag and made the
function check:

1. Registered provider profile (supports_vision flag)
2. Model capabilities from models.dev catalog (supports_vision)
3. Existing hardcoded allowlist (unchanged)

This fixes HTTP 400 "text is not set" errors when vision-capable
custom providers receive text-only tool results instead of
multipart image content.

Related: #25594
---
 plugins/model-providers/xiaomi/__init__.py |  1 +
 providers/base.py                          |  9 ++++++++
 tools/vision_tools.py                      | 25 +++++++++++++++++++++-
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/plugins/model-providers/xiaomi/__init__.py b/plugins/model-providers/xiaomi/__init__.py
index aed0d8424..93c7dbb29 100644
--- a/plugins/model-providers/xiaomi/__init__.py
+++ b/plugins/model-providers/xiaomi/__init__.py
@@ -9,6 +9,7 @@ xiaomi = ProviderProfile(
     env_vars=("XIAOMI_API_KEY",),
     base_url="https://api.xiaomimimo.com/v1",
     supports_health_check=False,  # /v1/models returns 401 even with valid key
+    supports_vision=True,  # mimo-v2-omni is vision-capable
 )
 
 register_provider(xiaomi)
diff --git a/providers/base.py b/providers/base.py
index 01023ff55..d7ff470d8 100644
--- a/providers/base.py
+++ b/providers/base.py
@@ -56,6 +56,15 @@ class ProviderProfile:
     auth_type: str = "api_key"   # api_key|oauth_device_code|oauth_external|copilot|aws_sdk
     supports_health_check: bool = True  # False → doctor skips /models probe for this provider
 
+    # ── Vision support ────────────────────────────────────────
+    # True when the provider's API accepts image content inside
+    # tool-result messages natively.  Set on providers that expose
+    # multimodal models via tool results (Anthropic Messages API,
+    # OpenAI Chat Completions, Gemini, Xiaomi, MiniMax, etc.).
+    # When this is False, or the provider has no registered profile,
+    # _supports_media_in_tool_results() tries the model catalog next.
+    supports_vision: bool = False
+
     # ── Model catalog ─────────────────────────────────────────
     # fallback_models: curated list shown in /model picker when live fetch fails.
     # Only agentic models that support tool calling should appear here.
diff --git a/tools/vision_tools.py b/tools/vision_tools.py
index 10e97298a..253856b9b 100644
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@@ -540,7 +540,9 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool:
         results. Older Gemini does NOT.
 
     For unknown / legacy providers we conservatively return False — the
-    caller falls back to the legacy aux-LLM text path.
+    caller falls back to the legacy aux-LLM text path.  The check is relaxed
+    when the provider's ``ProviderProfile`` declares ``supports_vision=True``
+    or when ``get_model_capabilities`` reports vision support for the model.
     """
     if not isinstance(provider, str):
         return False
@@ -577,6 +579,27 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool:
             return True
         return False
 
+    # Check the provider's registered profile for the supports_vision flag.
+    # This covers vision-capable providers (e.g. xiaomi) whose profile sets
+    # the flag but that aren't in the hardcoded list above.
+    try:
+        from providers import get_provider_profile
+        profile = get_provider_profile(p)
+        if profile is not None and profile.supports_vision:
+            return True
+    except Exception:
+        pass
+
+    # Check model capabilities from the models.dev catalog as a final
+    # fallback for custom providers whose models happen to be registered.
+    try:
+        from agent.models_dev import get_model_capabilities
+        caps = get_model_capabilities(provider, model)
+        if caps is not None and bool(getattr(caps, "supports_vision", False)):
+            return True
+    except Exception:
+        pass
+
     # Other vision-capable provider stacks. Conservative default: False.
     # Add explicit entries here as we verify each provider's tool-result
     # multimodal support empirically.