fix(xai): route video models by modality

This commit is contained in:
Julien Talbot
2026-05-31 10:49:59 +04:00
committed by Teknium
parent eee32cdd52
commit 8104b20269
8 changed files with 167 additions and 19 deletions

View File

@ -21,9 +21,12 @@ delivers it.
from __future__ import annotations
import asyncio
import base64
import logging
import mimetypes
import os
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import httpx
@ -42,7 +45,9 @@ logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
# Text-only prompts and image-conditioned prompts default to different models.
DEFAULT_TEXT_TO_VIDEO_MODEL = "grok-imagine-video"
DEFAULT_IMAGE_TO_VIDEO_MODEL = "grok-imagine-video-1.5-preview"
# Provider-wide default: defined once, as an alias of the text-to-video model,
# so the two names can never drift apart.
DEFAULT_MODEL = DEFAULT_TEXT_TO_VIDEO_MODEL
DEFAULT_DURATION = 8
DEFAULT_ASPECT_RATIO = "16:9"
DEFAULT_RESOLUTION = "720p"
@ -58,10 +63,18 @@ _MODELS: Dict[str, Dict[str, Any]] = {
"grok-imagine-video": {
"display": "Grok Imagine Video",
"speed": "~60-240s",
"strengths": "Text-to-video + image-to-video; up to 7 reference images for style/character.",
"price": "see https://docs.x.ai/docs/models",
"strengths": "Text-to-video; legacy image-to-video fallback.",
"price": "see https://docs.x.ai/developers/models/grok-imagine-video",
"modalities": ["text", "image"],
},
"grok-imagine-video-1.5-preview": {
"display": "Grok Imagine Video 1.5 Preview",
"speed": "~60-240s",
"strengths": "Latest xAI image-to-video model.",
"price": "see https://docs.x.ai/developers/models/grok-imagine-video-1.5-preview",
"modalities": ["image"],
"aliases": ["grok-imagine-video-1.5-2026-05-30"],
},
}
@ -111,10 +124,31 @@ def _xai_headers(api_key: str) -> Dict[str, str]:
}
def _image_ref_to_xai_url(value: str) -> str:
    """Return a URL/data URI accepted by xAI for image inputs.

    The reference is stripped of surrounding whitespace and then handled as
    follows:

    * empty / ``None`` -> ``""``;
    * ``http://``, ``https://`` or ``data:image/`` prefixes (matched
      case-insensitively) -> returned as-is;
    * a path to an existing local file whose guessed MIME type is ``image/*``
      -> the file is read and returned as a ``data:<mime>;base64,...`` URI;
    * anything else -> returned as-is.

    Args:
        value: Remote URL, data URI, or local filesystem path (``~`` is
            expanded).

    Returns:
        The stripped reference, or a base64 data URI for a local image file.

    Raises:
        OSError: If an existing local image file cannot be read.
    """
    ref = (value or "").strip()
    if not ref:
        return ""
    if ref.lower().startswith(("http://", "https://", "data:image/")):
        return ref
    try:
        path = Path(ref).expanduser()
        if not path.is_file():
            return ref
    except (OSError, RuntimeError, ValueError):
        # expanduser() raises RuntimeError for an unresolvable ``~user`` and
        # is_file() may raise OSError (e.g. an over-long name). Neither means
        # the caller handed us a usable local file, so pass the value through
        # unchanged instead of aborting the whole generation request.
        return ref
    mime = mimetypes.guess_type(path.name)[0] or ""
    if not mime.startswith("image/"):
        return ref
    encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    return f"data:{mime};base64,{encoded}"
def _normalize_reference_images(
    reference_image_urls: Optional[List[str]],
) -> Optional[List[Dict[str, str]]]:
    """Build the ``reference_images`` payload entries for an xAI request.

    Each reference is passed through ``_image_ref_to_xai_url``; references
    that normalize to an empty string are dropped.

    Args:
        reference_image_urls: Image references supplied by the caller, or
            ``None``.

    Returns:
        A list of ``{"url": <normalized>}`` dicts, or ``None`` when no usable
        reference remains.
    """
    normalized_refs = (
        _image_ref_to_xai_url(url) for url in reference_image_urls or []
    )
    refs = [{"url": ref} for ref in normalized_refs if ref]
    return refs or None
@ -131,6 +165,28 @@ def _clamp_duration(duration: Optional[int], has_reference_images: bool) -> int:
return value
def _resolve_model_for_modality(
    model: Optional[str],
    *,
    modality: str,
    explicit_model: bool,
) -> str:
    """Pick the xAI video model to send for the given input modality.

    ``grok-imagine-video-1.5-preview`` currently rejects text-only video
    generation, but it is the desired image-to-video backend, so a model that
    was not explicitly requested is treated as a default that can be rerouted.

    Args:
        model: Requested or configured model name; may be ``None`` or blank.
        modality: ``"image"`` for image-conditioned requests; any other value
            takes the text-to-video path.
        explicit_model: ``True`` when ``model`` came from an explicit tool
            ``model=`` argument, in which case a non-blank ``model`` wins.

    Returns:
        The model identifier to put in the request payload.
    """
    candidate = model.strip() if model else ""

    # An intentional, non-blank override is never rerouted.
    if candidate and explicit_model:
        return candidate

    # Image inputs always go to the image-to-video default.
    if modality == "image":
        return DEFAULT_IMAGE_TO_VIDEO_MODEL

    # Text path: no model given, or the image-only model was configured.
    if not candidate or candidate == DEFAULT_IMAGE_TO_VIDEO_MODEL:
        return DEFAULT_TEXT_TO_VIDEO_MODEL
    return candidate
async def _submit(
client: httpx.AsyncClient,
payload: Dict[str, Any],
@ -192,7 +248,7 @@ async def _poll(
class XAIVideoGenProvider(VideoGenProvider):
"""xAI grok-imagine-video backend (text-to-video + image-to-video)."""
"""xAI Grok Imagine video backend (text-to-video + image-to-video)."""
@property
def name(self) -> str:
@ -222,7 +278,7 @@ class XAIVideoGenProvider(VideoGenProvider):
return {
"name": "xAI Grok Imagine",
"badge": "paid",
"tag": "grok-imagine-video text-to-video & image-to-video; uses xAI Grok OAuth or XAI_API_KEY",
"tag": "grok-imagine-video for text-to-video; grok-imagine-video-1.5-preview for image-to-video; uses xAI Grok OAuth or XAI_API_KEY",
"env_vars": [],
"post_setup": "xai_grok",
}
@ -260,6 +316,7 @@ class XAIVideoGenProvider(VideoGenProvider):
return loop.run_until_complete(self._generate_async(
prompt=prompt,
model=model,
explicit_model=bool(kwargs.get("_model_override_explicit")),
image_url=image_url,
reference_image_urls=reference_image_urls,
duration=duration,
@ -284,6 +341,7 @@ class XAIVideoGenProvider(VideoGenProvider):
*,
prompt: str,
model: Optional[str],
explicit_model: bool,
image_url: Optional[str],
reference_image_urls: Optional[List[str]],
duration: Optional[int],
@ -303,10 +361,15 @@ class XAIVideoGenProvider(VideoGenProvider):
)
prompt = (prompt or "").strip()
image_url_norm = (image_url or "").strip() or None
image_url_norm = _image_ref_to_xai_url(image_url or "") or None
normalized_aspect_ratio = (aspect_ratio or DEFAULT_ASPECT_RATIO).strip()
normalized_resolution = (resolution or DEFAULT_RESOLUTION).strip().lower()
modality_used = "image" if image_url_norm else "text"
resolved_model = _resolve_model_for_modality(
model,
modality=modality_used,
explicit_model=explicit_model,
)
if not prompt:
return error_response(
@ -340,7 +403,7 @@ class XAIVideoGenProvider(VideoGenProvider):
normalized_resolution = DEFAULT_RESOLUTION
payload: Dict[str, Any] = {
"model": model or DEFAULT_MODEL,
"model": resolved_model,
"prompt": prompt,
"duration": clamped_duration,
"aspect_ratio": normalized_aspect_ratio,
@ -366,7 +429,7 @@ class XAIVideoGenProvider(VideoGenProvider):
error=f"xAI submit failed ({exc.response.status_code}): {detail or exc}",
error_type="api_error",
provider="xai",
model=model or DEFAULT_MODEL,
model=resolved_model,
prompt=prompt,
)
@ -388,7 +451,7 @@ class XAIVideoGenProvider(VideoGenProvider):
error="xAI video generation completed without a video URL",
error_type="empty_response",
provider="xai",
model=body.get("model") or model or DEFAULT_MODEL,
model=body.get("model") or resolved_model,
prompt=prompt,
)
extra: Dict[str, Any] = {
@ -399,7 +462,7 @@ class XAIVideoGenProvider(VideoGenProvider):
extra["usage"] = body["usage"]
return success_response(
video=url,
model=body.get("model") or model or DEFAULT_MODEL,
model=body.get("model") or resolved_model,
prompt=prompt,
modality=modality_used,
aspect_ratio=normalized_aspect_ratio,
@ -413,7 +476,7 @@ class XAIVideoGenProvider(VideoGenProvider):
error=f"Timed out waiting for video generation after {DEFAULT_TIMEOUT_SECONDS}s",
error_type="timeout",
provider="xai",
model=model or DEFAULT_MODEL,
model=resolved_model,
prompt=prompt,
)
@ -426,7 +489,7 @@ class XAIVideoGenProvider(VideoGenProvider):
error=message,
error_type=f"xai_{status}",
provider="xai",
model=model or DEFAULT_MODEL,
model=resolved_model,
prompt=prompt,
)

View File

@ -1,6 +1,6 @@
name: xai
version: 1.0.0
description: "xAI Grok-Imagine video generation backend. Supports text-to-video, image-to-video, reference-image-guided generation, video edit, and video extend via the xAI async videos API."
description: "xAI Grok Imagine video generation backend. Supports text-to-video, image-to-video, and reference-image-guided generation via the xAI async videos API."
author: NousResearch
kind: backend
requires_env:

View File

@ -25,6 +25,43 @@ def test_xai_provider_registers():
assert provider.default_model() == "grok-imagine-video"
def test_xai_provider_lists_text_and_current_image_video_models():
    """The catalogue lists the text model first, then the image-to-video preview."""
    from plugins.video_gen.xai import XAIVideoGenProvider

    catalogue = XAIVideoGenProvider().list_models()
    model_ids = [entry["id"] for entry in catalogue]

    assert model_ids[:2] == [
        "grok-imagine-video",
        "grok-imagine-video-1.5-preview",
    ]

    image_entry = catalogue[1]
    assert image_entry["modalities"] == ["image"]
    assert image_entry["aliases"] == ["grok-imagine-video-1.5-2026-05-30"]
def test_xai_routes_default_models_by_modality():
    """Non-explicit models are rerouted by modality; explicit ones are kept."""
    from plugins.video_gen.xai import _resolve_model_for_modality

    # (configured model, modality, explicit?, expected model)
    cases = [
        ("grok-imagine-video", "text", False, "grok-imagine-video"),
        ("grok-imagine-video", "image", False, "grok-imagine-video-1.5-preview"),
        ("grok-imagine-video-1.5-preview", "text", False, "grok-imagine-video"),
        (
            "grok-imagine-video-1.5-preview",
            "text",
            True,
            "grok-imagine-video-1.5-preview",
        ),
    ]
    for configured, modality, explicit, expected in cases:
        resolved = _resolve_model_for_modality(
            configured,
            modality=modality,
            explicit_model=explicit,
        )
        assert resolved == expected
def test_xai_capabilities_text_and_image_only():
"""xAI was previously advertised with edit/extend operations. The
simplified surface only exposes text-to-video and image-to-video —

View File

@ -56,7 +56,7 @@ class _FakeAsyncClient:
return _FakeResponse(200, {
"status": "done",
"video": {"url": "https://xai-cdn/out.mp4", "duration": 8},
"model": "grok-imagine-video",
"model": self.posts[-1]["json"]["model"],
})
@ -113,6 +113,7 @@ class TestXAIPayload:
provider, captured = xai_provider
provider.generate("a dog at sunset")
payload = _last_post(captured)["json"]
assert payload["model"] == "grok-imagine-video"
assert payload["prompt"] == "a dog at sunset"
assert "image" not in payload
assert "reference_images" not in payload
@ -121,8 +122,31 @@ class TestXAIPayload:
provider, captured = xai_provider
provider.generate("animate this", image_url="https://example.com/cat.png")
payload = _last_post(captured)["json"]
assert payload["model"] == "grok-imagine-video-1.5-preview"
assert payload["image"] == {"url": "https://example.com/cat.png"}
def test_local_image_path_is_sent_as_data_uri(self, xai_provider, tmp_path):
    """A local image path is sent to xAI as a base64 ``data:`` URI."""
    provider, captured = xai_provider
    local_frame = tmp_path / "frame.png"
    local_frame.write_bytes(b"\x89PNG\r\n\x1a\nfake")

    provider.generate("animate this", image_url=str(local_frame))

    sent = _last_post(captured)["json"]
    assert sent["model"] == "grok-imagine-video-1.5-preview"
    assert sent["image"]["url"].startswith("data:image/png;base64,")
def test_explicit_model_override_is_honored_for_image(self, xai_provider):
    """An explicit ``model=`` is kept even when an image input is supplied."""
    provider, captured = xai_provider
    request = {
        "image_url": "https://example.com/cat.png",
        "model": "grok-imagine-video",
        "_model_override_explicit": True,
    }

    provider.generate("animate this", **request)

    sent = _last_post(captured)["json"]
    assert sent["model"] == "grok-imagine-video"
def test_reference_images_payload(self, xai_provider):
provider, captured = xai_provider
provider.generate(

View File

@ -82,7 +82,7 @@ def matrix_env(tmp_path, monkeypatch):
return _Resp({
"status": "done",
"video": {"url": "https://xai-cdn/out.mp4", "duration": 8},
"model": "grok-imagine-video",
"model": xai_calls[-1]["json"].get("model", "grok-imagine-video"),
})
import plugins.video_gen.xai as xai_plugin
monkeypatch.setattr(xai_plugin.httpx, "AsyncClient", lambda: _Client())
@ -202,6 +202,7 @@ def test_xai_text_only_via_tool_surface(matrix_env):
assert len(xai_calls) == 1
assert xai_calls[0]["url"].endswith("/videos/generations")
payload = xai_calls[0]["json"] or {}
assert payload["model"] == "grok-imagine-video"
assert "image" not in payload
assert "reference_images" not in payload
@ -221,6 +222,26 @@ def test_xai_text_plus_image_via_tool_surface(matrix_env):
assert len(xai_calls) == 1
assert xai_calls[0]["url"].endswith("/videos/generations")
payload = xai_calls[0]["json"] or {}
assert payload["model"] == "grok-imagine-video-1.5-preview"
assert payload["image"] == {"url": "https://example.com/img.png"}
def test_xai_explicit_model_override_via_tool_surface(matrix_env):
    """A ``model`` passed in the tool arguments reaches xAI unchanged."""
    home, _, xai_calls = matrix_env
    config = {"video_gen": {"provider": "xai"}}
    tool_args = {
        "prompt": "animate this",
        "image_url": "https://example.com/img.png",
        "model": "grok-imagine-video",
    }

    result = _invoke_tool(home, config, tool_args)

    assert result["success"] is True
    sent = xai_calls[0]["json"] or {}
    assert sent["model"] == "grok-imagine-video"
    assert sent["image"] == {"url": "https://example.com/img.png"}

View File

@ -336,6 +336,7 @@ def _handle_video_generate(args: Dict[str, Any], **_kw: Any) -> str:
kwargs: Dict[str, Any] = {
"model": model,
"_model_override_explicit": bool(model_override),
"image_url": image_url,
"reference_image_urls": reference_image_urls,
"duration": duration,

View File

@ -182,7 +182,8 @@ The `x_search` toolset auto-enables whenever xAI credentials (a SuperGrok / X Pr
| Chat | `grok-4.20-multi-agent-0309` | Multi-agent variant |
| Image | `grok-imagine-image` | Default; ~5–10 s |
| Image | `grok-imagine-image-quality` | Higher fidelity; ~10–20 s |
| Video | `grok-imagine-video` | Text-to-video and image-to-video; up to 7 reference images |
| Video | `grok-imagine-video` | Text-to-video |
| Video | `grok-imagine-video-1.5-preview` | Image-to-video; dated alias `grok-imagine-video-1.5-2026-05-30` |
| TTS | (default voice) | xAI `/v1/tts` endpoint |
The chat catalog is derived live from the on-disk `models.dev` cache; new xAI releases appear automatically once that cache refreshes. `grok-4.3` is always pinned to the top of the list.

View File

@ -180,7 +180,8 @@ hermes tools
| 对话 | `grok-4.20-multi-agent-0309` | 多 agent 变体 |
| 图像 | `grok-imagine-image` | 默认;约 5–10 秒 |
| 图像 | `grok-imagine-image-quality` | 更高保真度;约 10–20 秒 |
| 视频 | `grok-imagine-video` | 文本转视频和图像转视频;最多 7 张参考图像 |
| 视频 | `grok-imagine-video` | 文本转视频 |
| 视频 | `grok-imagine-video-1.5-preview` | 图像转视频;日期别名 `grok-imagine-video-1.5-2026-05-30` |
| TTS | (默认音色) | xAI `/v1/tts` 端点 |
对话模型目录从磁盘上的 `models.dev` 缓存实时获取;缓存刷新后,新的 xAI 模型会自动出现。`grok-4.3` 始终固定在列表顶部。