fix(vision): detect vision-capable custom providers via ProviderProfile flag
_supports_media_in_tool_results() had a hardcoded provider allowlist that missed custom providers and newer vision-capable providers such as xiaomi.

Added a ProviderProfile.supports_vision flag and made the function check:

1. The registered provider profile (supports_vision flag)
2. Model capabilities from the models.dev catalog (supports_vision)
3. The existing hardcoded allowlist (unchanged)

This fixes HTTP 400 "text is not set" errors when vision-capable custom providers receive text-only tool results instead of multipart image content.

Related: #25594
This commit is contained in:
@ -9,6 +9,7 @@ xiaomi = ProviderProfile(
|
||||
env_vars=("XIAOMI_API_KEY",),
|
||||
base_url="https://api.xiaomimimo.com/v1",
|
||||
supports_health_check=False, # /v1/models returns 401 even with valid key
|
||||
supports_vision=True, # mimo-v2-omni is vision-capable
|
||||
)
|
||||
|
||||
register_provider(xiaomi)
|
||||
|
||||
@ -56,6 +56,15 @@ class ProviderProfile:
|
||||
auth_type: str = "api_key" # api_key|oauth_device_code|oauth_external|copilot|aws_sdk
|
||||
supports_health_check: bool = True # False → doctor skips /models probe for this provider
|
||||
|
||||
# ── Vision support ────────────────────────────────────────
|
||||
# True when the provider's API accepts image content inside
|
||||
# tool-result messages natively. Set on providers that expose
|
||||
# multimodal models via tool results (Anthropic Messages API,
|
||||
# OpenAI Chat Completions, Gemini, Xiaomi, MiniMax, etc.).
|
||||
# When False, or when the provider has no registered profile,
|
||||
# the model-catalog lookup is used as a fallback.
|
||||
supports_vision: bool = False
|
||||
|
||||
# ── Model catalog ─────────────────────────────────────────
|
||||
# fallback_models: curated list shown in /model picker when live fetch fails.
|
||||
# Only agentic models that support tool calling should appear here.
|
||||
|
||||
@ -540,7 +540,9 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool:
|
||||
results. Older Gemini does NOT.
|
||||
|
||||
For unknown / legacy providers we conservatively return False — the
|
||||
caller falls back to the legacy aux-LLM text path.
|
||||
caller falls back to the legacy aux-LLM text path. The check is relaxed
|
||||
when the provider's ``ProviderProfile`` declares ``supports_vision=True``
|
||||
or when ``get_model_capabilities`` reports vision support for the model.
|
||||
"""
|
||||
if not isinstance(provider, str):
|
||||
return False
|
||||
@ -577,6 +579,27 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool:
|
||||
return True
|
||||
return False
|
||||
|
||||
# Check the provider's registered profile for the supports_vision flag.
|
||||
# This covers vision-capable providers like xiaomi, minimax, etc. that
|
||||
# aren't in the hardcoded list above.
|
||||
try:
|
||||
from providers import get_provider_profile
|
||||
profile = get_provider_profile(p)
|
||||
if profile is not None and profile.supports_vision:
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Check model capabilities from the models.dev catalog as a final
|
||||
# fallback for custom providers whose models happen to be registered.
|
||||
try:
|
||||
from agent.models_dev import get_model_capabilities
|
||||
caps = get_model_capabilities(provider, model)
|
||||
if caps is not None and bool(getattr(caps, "supports_vision", False)):
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Other vision-capable provider stacks. Conservative default: False.
|
||||
# Add explicit entries here as we verify each provider's tool-result
|
||||
# multimodal support empirically.
|
||||
|
||||
Reference in New Issue
Block a user