fix(xai): route video models by modality

This commit is contained in:
Julien Talbot
2026-05-31 10:49:59 +04:00
committed by Teknium
parent eee32cdd52
commit 8104b20269
8 changed files with 167 additions and 19 deletions

View File

@ -21,9 +21,12 @@ delivers it.
from __future__ import annotations from __future__ import annotations
import asyncio import asyncio
import base64
import logging import logging
import mimetypes
import os import os
import uuid import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import httpx import httpx
@ -42,7 +45,9 @@ logger = logging.getLogger(__name__)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1" DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
DEFAULT_MODEL = "grok-imagine-video" DEFAULT_TEXT_TO_VIDEO_MODEL = "grok-imagine-video"
DEFAULT_IMAGE_TO_VIDEO_MODEL = "grok-imagine-video-1.5-preview"
DEFAULT_MODEL = DEFAULT_TEXT_TO_VIDEO_MODEL
DEFAULT_DURATION = 8 DEFAULT_DURATION = 8
DEFAULT_ASPECT_RATIO = "16:9" DEFAULT_ASPECT_RATIO = "16:9"
DEFAULT_RESOLUTION = "720p" DEFAULT_RESOLUTION = "720p"
@ -58,10 +63,18 @@ _MODELS: Dict[str, Dict[str, Any]] = {
"grok-imagine-video": { "grok-imagine-video": {
"display": "Grok Imagine Video", "display": "Grok Imagine Video",
"speed": "~60-240s", "speed": "~60-240s",
"strengths": "Text-to-video + image-to-video; up to 7 reference images for style/character.", "strengths": "Text-to-video; legacy image-to-video fallback.",
"price": "see https://docs.x.ai/docs/models", "price": "see https://docs.x.ai/developers/models/grok-imagine-video",
"modalities": ["text", "image"], "modalities": ["text", "image"],
}, },
"grok-imagine-video-1.5-preview": {
"display": "Grok Imagine Video 1.5 Preview",
"speed": "~60-240s",
"strengths": "Latest xAI image-to-video model.",
"price": "see https://docs.x.ai/developers/models/grok-imagine-video-1.5-preview",
"modalities": ["image"],
"aliases": ["grok-imagine-video-1.5-2026-05-30"],
},
} }
@ -111,10 +124,31 @@ def _xai_headers(api_key: str) -> Dict[str, str]:
} }
def _image_ref_to_xai_url(value: str) -> str:
"""Return a URL/data URI accepted by xAI for image inputs."""
ref = (value or "").strip()
if not ref:
return ""
lower = ref.lower()
if lower.startswith(("http://", "https://", "data:image/")):
return ref
path = Path(ref).expanduser()
if not path.is_file():
return ref
mime = mimetypes.guess_type(path.name)[0] or "application/octet-stream"
if not mime.startswith("image/"):
return ref
encoded = base64.b64encode(path.read_bytes()).decode("ascii")
return f"data:{mime};base64,{encoded}"
def _normalize_reference_images(reference_image_urls: Optional[List[str]]):
    """Build the ``{"url": ...}`` entries for reference images.

    Each reference is passed through ``_image_ref_to_xai_url``; references
    that normalize to an empty string are dropped. Returns ``None`` when no
    usable reference remains.
    """
    resolved_refs = (
        _image_ref_to_xai_url(raw) for raw in (reference_image_urls or [])
    )
    entries = [{"url": resolved} for resolved in resolved_refs if resolved]
    return entries if entries else None
@ -131,6 +165,28 @@ def _clamp_duration(duration: Optional[int], has_reference_images: bool) -> int:
return value return value
def _resolve_model_for_modality(
    model: Optional[str],
    *,
    modality: str,
    explicit_model: bool,
) -> str:
    """Select the xAI video model for the requested modality.

    A configured model is treated as a default, not as a per-call override:
    image requests are routed to ``DEFAULT_IMAGE_TO_VIDEO_MODEL`` and text
    requests never use the image-only model. An explicit tool ``model=``
    still wins for users who intentionally request another model.

    Args:
        model: Configured or requested model id; ``None``/blank means unset.
        modality: ``"image"`` for image-to-video; any other value is handled
            as text-to-video.
        explicit_model: True when ``model`` came from an explicit per-call
            override rather than configuration.

    Returns:
        The model id to send to the xAI API.
    """
    requested = (model or "").strip()
    if explicit_model and requested:
        return requested
    if modality == "image":
        return DEFAULT_IMAGE_TO_VIDEO_MODEL

    # The image-to-video model is catalogued as image-only, and its dated
    # aliases name the same model, so none of them may be used for a
    # text-only request.
    image_only_ids = {DEFAULT_IMAGE_TO_VIDEO_MODEL}
    image_only_ids.update(
        _MODELS.get(DEFAULT_IMAGE_TO_VIDEO_MODEL, {}).get("aliases", ())
    )
    if requested in image_only_ids:
        return DEFAULT_TEXT_TO_VIDEO_MODEL
    return requested or DEFAULT_TEXT_TO_VIDEO_MODEL
async def _submit( async def _submit(
client: httpx.AsyncClient, client: httpx.AsyncClient,
payload: Dict[str, Any], payload: Dict[str, Any],
@ -192,7 +248,7 @@ async def _poll(
class XAIVideoGenProvider(VideoGenProvider): class XAIVideoGenProvider(VideoGenProvider):
"""xAI grok-imagine-video backend (text-to-video + image-to-video).""" """xAI Grok Imagine video backend (text-to-video + image-to-video)."""
@property @property
def name(self) -> str: def name(self) -> str:
@ -222,7 +278,7 @@ class XAIVideoGenProvider(VideoGenProvider):
return { return {
"name": "xAI Grok Imagine", "name": "xAI Grok Imagine",
"badge": "paid", "badge": "paid",
"tag": "grok-imagine-video text-to-video & image-to-video; uses xAI Grok OAuth or XAI_API_KEY", "tag": "grok-imagine-video for text-to-video; grok-imagine-video-1.5-preview for image-to-video; uses xAI Grok OAuth or XAI_API_KEY",
"env_vars": [], "env_vars": [],
"post_setup": "xai_grok", "post_setup": "xai_grok",
} }
@ -260,6 +316,7 @@ class XAIVideoGenProvider(VideoGenProvider):
return loop.run_until_complete(self._generate_async( return loop.run_until_complete(self._generate_async(
prompt=prompt, prompt=prompt,
model=model, model=model,
explicit_model=bool(kwargs.get("_model_override_explicit")),
image_url=image_url, image_url=image_url,
reference_image_urls=reference_image_urls, reference_image_urls=reference_image_urls,
duration=duration, duration=duration,
@ -284,6 +341,7 @@ class XAIVideoGenProvider(VideoGenProvider):
*, *,
prompt: str, prompt: str,
model: Optional[str], model: Optional[str],
explicit_model: bool,
image_url: Optional[str], image_url: Optional[str],
reference_image_urls: Optional[List[str]], reference_image_urls: Optional[List[str]],
duration: Optional[int], duration: Optional[int],
@ -303,10 +361,15 @@ class XAIVideoGenProvider(VideoGenProvider):
) )
prompt = (prompt or "").strip() prompt = (prompt or "").strip()
image_url_norm = (image_url or "").strip() or None image_url_norm = _image_ref_to_xai_url(image_url or "") or None
normalized_aspect_ratio = (aspect_ratio or DEFAULT_ASPECT_RATIO).strip() normalized_aspect_ratio = (aspect_ratio or DEFAULT_ASPECT_RATIO).strip()
normalized_resolution = (resolution or DEFAULT_RESOLUTION).strip().lower() normalized_resolution = (resolution or DEFAULT_RESOLUTION).strip().lower()
modality_used = "image" if image_url_norm else "text" modality_used = "image" if image_url_norm else "text"
resolved_model = _resolve_model_for_modality(
model,
modality=modality_used,
explicit_model=explicit_model,
)
if not prompt: if not prompt:
return error_response( return error_response(
@ -340,7 +403,7 @@ class XAIVideoGenProvider(VideoGenProvider):
normalized_resolution = DEFAULT_RESOLUTION normalized_resolution = DEFAULT_RESOLUTION
payload: Dict[str, Any] = { payload: Dict[str, Any] = {
"model": model or DEFAULT_MODEL, "model": resolved_model,
"prompt": prompt, "prompt": prompt,
"duration": clamped_duration, "duration": clamped_duration,
"aspect_ratio": normalized_aspect_ratio, "aspect_ratio": normalized_aspect_ratio,
@ -366,7 +429,7 @@ class XAIVideoGenProvider(VideoGenProvider):
error=f"xAI submit failed ({exc.response.status_code}): {detail or exc}", error=f"xAI submit failed ({exc.response.status_code}): {detail or exc}",
error_type="api_error", error_type="api_error",
provider="xai", provider="xai",
model=model or DEFAULT_MODEL, model=resolved_model,
prompt=prompt, prompt=prompt,
) )
@ -388,7 +451,7 @@ class XAIVideoGenProvider(VideoGenProvider):
error="xAI video generation completed without a video URL", error="xAI video generation completed without a video URL",
error_type="empty_response", error_type="empty_response",
provider="xai", provider="xai",
model=body.get("model") or model or DEFAULT_MODEL, model=body.get("model") or resolved_model,
prompt=prompt, prompt=prompt,
) )
extra: Dict[str, Any] = { extra: Dict[str, Any] = {
@ -399,7 +462,7 @@ class XAIVideoGenProvider(VideoGenProvider):
extra["usage"] = body["usage"] extra["usage"] = body["usage"]
return success_response( return success_response(
video=url, video=url,
model=body.get("model") or model or DEFAULT_MODEL, model=body.get("model") or resolved_model,
prompt=prompt, prompt=prompt,
modality=modality_used, modality=modality_used,
aspect_ratio=normalized_aspect_ratio, aspect_ratio=normalized_aspect_ratio,
@ -413,7 +476,7 @@ class XAIVideoGenProvider(VideoGenProvider):
error=f"Timed out waiting for video generation after {DEFAULT_TIMEOUT_SECONDS}s", error=f"Timed out waiting for video generation after {DEFAULT_TIMEOUT_SECONDS}s",
error_type="timeout", error_type="timeout",
provider="xai", provider="xai",
model=model or DEFAULT_MODEL, model=resolved_model,
prompt=prompt, prompt=prompt,
) )
@ -426,7 +489,7 @@ class XAIVideoGenProvider(VideoGenProvider):
error=message, error=message,
error_type=f"xai_{status}", error_type=f"xai_{status}",
provider="xai", provider="xai",
model=model or DEFAULT_MODEL, model=resolved_model,
prompt=prompt, prompt=prompt,
) )

View File

@ -1,6 +1,6 @@
name: xai name: xai
version: 1.0.0 version: 1.0.0
description: "xAI Grok-Imagine video generation backend. Supports text-to-video, image-to-video, reference-image-guided generation, video edit, and video extend via the xAI async videos API." description: "xAI Grok Imagine video generation backend. Supports text-to-video, image-to-video, and reference-image-guided generation via the xAI async videos API."
author: NousResearch author: NousResearch
kind: backend kind: backend
requires_env: requires_env:

View File

@ -25,6 +25,43 @@ def test_xai_provider_registers():
assert provider.default_model() == "grok-imagine-video" assert provider.default_model() == "grok-imagine-video"
def test_xai_provider_lists_text_and_current_image_video_models():
    """The catalog lists the text model first, then the image-only 1.5 preview."""
    from plugins.video_gen.xai import XAIVideoGenProvider

    catalog = XAIVideoGenProvider().list_models()
    model_ids = [entry["id"] for entry in catalog]

    assert model_ids[0] == "grok-imagine-video"
    assert model_ids[1] == "grok-imagine-video-1.5-preview"

    preview = catalog[1]
    assert preview["modalities"] == ["image"]
    assert preview["aliases"] == ["grok-imagine-video-1.5-2026-05-30"]
def test_xai_routes_default_models_by_modality():
    """Configured models are routed by modality; explicit overrides are kept."""
    from plugins.video_gen.xai import _resolve_model_for_modality

    # (configured model, modality, explicit override?, expected model)
    cases = [
        ("grok-imagine-video", "text", False, "grok-imagine-video"),
        ("grok-imagine-video", "image", False, "grok-imagine-video-1.5-preview"),
        ("grok-imagine-video-1.5-preview", "text", False, "grok-imagine-video"),
        (
            "grok-imagine-video-1.5-preview",
            "text",
            True,
            "grok-imagine-video-1.5-preview",
        ),
    ]
    for configured, modality, explicit, expected in cases:
        resolved = _resolve_model_for_modality(
            configured,
            modality=modality,
            explicit_model=explicit,
        )
        assert resolved == expected
def test_xai_capabilities_text_and_image_only(): def test_xai_capabilities_text_and_image_only():
"""xAI was previously advertised with edit/extend operations. The """xAI was previously advertised with edit/extend operations. The
simplified surface only exposes text-to-video and image-to-video — simplified surface only exposes text-to-video and image-to-video —

View File

@ -56,7 +56,7 @@ class _FakeAsyncClient:
return _FakeResponse(200, { return _FakeResponse(200, {
"status": "done", "status": "done",
"video": {"url": "https://xai-cdn/out.mp4", "duration": 8}, "video": {"url": "https://xai-cdn/out.mp4", "duration": 8},
"model": "grok-imagine-video", "model": self.posts[-1]["json"]["model"],
}) })
@ -113,6 +113,7 @@ class TestXAIPayload:
provider, captured = xai_provider provider, captured = xai_provider
provider.generate("a dog at sunset") provider.generate("a dog at sunset")
payload = _last_post(captured)["json"] payload = _last_post(captured)["json"]
assert payload["model"] == "grok-imagine-video"
assert payload["prompt"] == "a dog at sunset" assert payload["prompt"] == "a dog at sunset"
assert "image" not in payload assert "image" not in payload
assert "reference_images" not in payload assert "reference_images" not in payload
@ -121,8 +122,31 @@ class TestXAIPayload:
provider, captured = xai_provider provider, captured = xai_provider
provider.generate("animate this", image_url="https://example.com/cat.png") provider.generate("animate this", image_url="https://example.com/cat.png")
payload = _last_post(captured)["json"] payload = _last_post(captured)["json"]
assert payload["model"] == "grok-imagine-video-1.5-preview"
assert payload["image"] == {"url": "https://example.com/cat.png"} assert payload["image"] == {"url": "https://example.com/cat.png"}
def test_local_image_path_is_sent_as_data_uri(self, xai_provider, tmp_path):
    """A local image path is submitted as an inline PNG data URI."""
    provider, captured = xai_provider
    local_frame = tmp_path / "frame.png"
    local_frame.write_bytes(b"\x89PNG\r\n\x1a\nfake")

    provider.generate("animate this", image_url=str(local_frame))

    sent = _last_post(captured)["json"]
    assert sent["model"] == "grok-imagine-video-1.5-preview"
    image_ref = sent["image"]["url"]
    assert image_ref.startswith("data:image/png;base64,")
def test_explicit_model_override_is_honored_for_image(self, xai_provider):
    """An explicit model override is sent unchanged for an image request."""
    provider, captured = xai_provider
    call_kwargs = {
        "image_url": "https://example.com/cat.png",
        "model": "grok-imagine-video",
        "_model_override_explicit": True,
    }

    provider.generate("animate this", **call_kwargs)

    sent = _last_post(captured)["json"]
    assert sent["model"] == "grok-imagine-video"
def test_reference_images_payload(self, xai_provider): def test_reference_images_payload(self, xai_provider):
provider, captured = xai_provider provider, captured = xai_provider
provider.generate( provider.generate(

View File

@ -82,7 +82,7 @@ def matrix_env(tmp_path, monkeypatch):
return _Resp({ return _Resp({
"status": "done", "status": "done",
"video": {"url": "https://xai-cdn/out.mp4", "duration": 8}, "video": {"url": "https://xai-cdn/out.mp4", "duration": 8},
"model": "grok-imagine-video", "model": xai_calls[-1]["json"].get("model", "grok-imagine-video"),
}) })
import plugins.video_gen.xai as xai_plugin import plugins.video_gen.xai as xai_plugin
monkeypatch.setattr(xai_plugin.httpx, "AsyncClient", lambda: _Client()) monkeypatch.setattr(xai_plugin.httpx, "AsyncClient", lambda: _Client())
@ -202,6 +202,7 @@ def test_xai_text_only_via_tool_surface(matrix_env):
assert len(xai_calls) == 1 assert len(xai_calls) == 1
assert xai_calls[0]["url"].endswith("/videos/generations") assert xai_calls[0]["url"].endswith("/videos/generations")
payload = xai_calls[0]["json"] or {} payload = xai_calls[0]["json"] or {}
assert payload["model"] == "grok-imagine-video"
assert "image" not in payload assert "image" not in payload
assert "reference_images" not in payload assert "reference_images" not in payload
@ -221,6 +222,26 @@ def test_xai_text_plus_image_via_tool_surface(matrix_env):
assert len(xai_calls) == 1 assert len(xai_calls) == 1
assert xai_calls[0]["url"].endswith("/videos/generations") assert xai_calls[0]["url"].endswith("/videos/generations")
payload = xai_calls[0]["json"] or {} payload = xai_calls[0]["json"] or {}
assert payload["model"] == "grok-imagine-video-1.5-preview"
assert payload["image"] == {"url": "https://example.com/img.png"}
def test_xai_explicit_model_override_via_tool_surface(matrix_env):
    """A ``model`` passed in the tool arguments is sent as-is with an image."""
    home, _, xai_calls = matrix_env
    config = {"video_gen": {"provider": "xai"}}
    tool_args = {
        "prompt": "animate this",
        "image_url": "https://example.com/img.png",
        "model": "grok-imagine-video",
    }

    result = _invoke_tool(home, config, tool_args)

    assert result["success"] is True
    sent = xai_calls[0]["json"] or {}
    assert sent["model"] == "grok-imagine-video"
    assert sent["image"] == {"url": "https://example.com/img.png"}

View File

@ -336,6 +336,7 @@ def _handle_video_generate(args: Dict[str, Any], **_kw: Any) -> str:
kwargs: Dict[str, Any] = { kwargs: Dict[str, Any] = {
"model": model, "model": model,
"_model_override_explicit": bool(model_override),
"image_url": image_url, "image_url": image_url,
"reference_image_urls": reference_image_urls, "reference_image_urls": reference_image_urls,
"duration": duration, "duration": duration,

View File

@ -182,7 +182,8 @@ The `x_search` toolset auto-enables whenever xAI credentials (a SuperGrok / X Pr
| Chat | `grok-4.20-multi-agent-0309` | Multi-agent variant | | Chat | `grok-4.20-multi-agent-0309` | Multi-agent variant |
| Image | `grok-imagine-image` | Default; ~5–10 s | | Image | `grok-imagine-image` | Default; ~5–10 s |
| Image | `grok-imagine-image-quality` | Higher fidelity; ~10–20 s | | Image | `grok-imagine-image-quality` | Higher fidelity; ~10–20 s |
| Video | `grok-imagine-video` | Text-to-video and image-to-video; up to 7 reference images | | Video | `grok-imagine-video` | Text-to-video |
| Video | `grok-imagine-video-1.5-preview` | Image-to-video; dated alias `grok-imagine-video-1.5-2026-05-30` |
| TTS | (default voice) | xAI `/v1/tts` endpoint | | TTS | (default voice) | xAI `/v1/tts` endpoint |
The chat catalog is derived live from the on-disk `models.dev` cache; new xAI releases appear automatically once that cache refreshes. `grok-4.3` is always pinned to the top of the list. The chat catalog is derived live from the on-disk `models.dev` cache; new xAI releases appear automatically once that cache refreshes. `grok-4.3` is always pinned to the top of the list.

View File

@ -180,7 +180,8 @@ hermes tools
| 对话 | `grok-4.20-multi-agent-0309` | 多 agent 变体 | | 对话 | `grok-4.20-multi-agent-0309` | 多 agent 变体 |
| 图像 | `grok-imagine-image` | 默认;约 5–10 秒 | | 图像 | `grok-imagine-image` | 默认;约 5–10 秒 |
| 图像 | `grok-imagine-image-quality` | 更高保真度;约 10–20 秒 | | 图像 | `grok-imagine-image-quality` | 更高保真度;约 10–20 秒 |
| 视频 | `grok-imagine-video` | 文本转视频和图像转视频;最多 7 张参考图像 | | 视频 | `grok-imagine-video` | 文本转视频 |
| 视频 | `grok-imagine-video-1.5-preview` | 图像转视频;日期别名 `grok-imagine-video-1.5-2026-05-30` |
| TTS | (默认音色) | xAI `/v1/tts` 端点 | | TTS | (默认音色) | xAI `/v1/tts` 端点 |
对话模型目录从磁盘上的 `models.dev` 缓存实时获取;缓存刷新后,新的 xAI 模型会自动出现。`grok-4.3` 始终固定在列表顶部。 对话模型目录从磁盘上的 `models.dev` 缓存实时获取;缓存刷新后,新的 xAI 模型会自动出现。`grok-4.3` 始终固定在列表顶部。
@ -266,4 +267,4 @@ hermes auth logout xai-oauth
- [AI Providers 参考](../integrations/providers.md) - [AI Providers 参考](../integrations/providers.md)
- [环境变量](../reference/environment-variables.md) - [环境变量](../reference/environment-variables.md)
- [配置](../user-guide/configuration.md) - [配置](../user-guide/configuration.md)
- [语音与 TTS](../user-guide/features/tts.md) - [语音与 TTS](../user-guide/features/tts.md)