fix(xai): route video models by modality

2026-05-31 10:49:59 +04:00
parent eee32cdd52
commit 8104b20269
8 changed files with 167 additions and 19 deletions
--- a/plugins/video_gen/xai/init.py
+++ b/plugins/video_gen/xai/init.py
@ -21,9 +21,12 @@ delivers it.
 from __future__ import annotations

 import asyncio
+import base64
 import logging
+import mimetypes
 import os
 import uuid
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple

 import httpx
@ -42,7 +45,9 @@ logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------

 DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
-DEFAULT_MODEL = "grok-imagine-video"
+DEFAULT_TEXT_TO_VIDEO_MODEL = "grok-imagine-video"
+DEFAULT_IMAGE_TO_VIDEO_MODEL = "grok-imagine-video-1.5-preview"
+DEFAULT_MODEL = DEFAULT_TEXT_TO_VIDEO_MODEL
 DEFAULT_DURATION = 8
 DEFAULT_ASPECT_RATIO = "16:9"
 DEFAULT_RESOLUTION = "720p"
@ -58,10 +63,18 @@ _MODELS: Dict[str, Dict[str, Any]] = {
    "grok-imagine-video": {
        "display": "Grok Imagine Video",
        "speed": "~60-240s",
-        "strengths": "Text-to-video + image-to-video; up to 7 reference images for style/character.",
-        "price": "see https://docs.x.ai/docs/models",
+        "strengths": "Text-to-video; legacy image-to-video fallback.",
+        "price": "see https://docs.x.ai/developers/models/grok-imagine-video",
        "modalities": ["text", "image"],
    },
+    "grok-imagine-video-1.5-preview": {
+        "display": "Grok Imagine Video 1.5 Preview",
+        "speed": "~60-240s",
+        "strengths": "Latest xAI image-to-video model.",
+        "price": "see https://docs.x.ai/developers/models/grok-imagine-video-1.5-preview",
+        "modalities": ["image"],
+        "aliases": ["grok-imagine-video-1.5-2026-05-30"],
+    },
 }


@ -111,10 +124,31 @@ def _xai_headers(api_key: str) -> Dict[str, str]:
    }


+def _image_ref_to_xai_url(value: str) -> str:
+    """Return a URL/data URI accepted by xAI for image inputs.
+
+    ``http://``/``https://`` URLs and ``data:image/`` URIs are returned
+    unchanged apart from surrounding whitespace. A reference that names an
+    existing local file whose extension maps to an ``image/*`` MIME type is
+    inlined as a base64 data URI. Anything else -- including a path that
+    cannot be resolved or read -- is returned stripped but otherwise
+    untouched. Empty or whitespace-only input yields ``""``.
+    """
+    ref = (value or "").strip()
+    if not ref:
+        return ""
+    lower = ref.lower()
+    if lower.startswith(("http://", "https://", "data:image/")):
+        return ref
+
+    # Arbitrary caller-supplied text reaches this point, so probing the
+    # filesystem must not raise: expanduser() raises RuntimeError for an
+    # unknown ``~user`` and the stat behind is_file() can raise OSError
+    # (for example on an over-long name).
+    try:
+        path = Path(ref).expanduser()
+        if not path.is_file():
+            return ref
+    except (OSError, RuntimeError, ValueError):
+        return ref
+
+    mime = mimetypes.guess_type(path.name)[0] or "application/octet-stream"
+    if not mime.startswith("image/"):
+        return ref
+
+    try:
+        encoded = base64.b64encode(path.read_bytes()).decode("ascii")
+    except OSError as exc:
+        # The file vanished or is unreadable between the check and the read;
+        # fall back to the raw reference like every other non-inlinable input.
+        logger.warning("Could not read local image for xAI upload: %s", exc)
+        return ref
+    return f"data:{mime};base64,{encoded}"
+
+
 def _normalize_reference_images(reference_image_urls: Optional[List[str]]):
+    """Convert reference image refs into xAI ``{"url": ...}`` entries.
+
+    Each ref is passed through ``_image_ref_to_xai_url``; refs that come
+    back empty are dropped. Returns ``None`` when no usable ref remains.
+    """
    refs = []
    for url in reference_image_urls or []:
-        normalized = (url or "").strip()
+        normalized = _image_ref_to_xai_url(url)
        if normalized:
            refs.append({"url": normalized})
    return refs or None
@ -131,6 +165,28 @@ def _clamp_duration(duration: Optional[int], has_reference_images: bool) -> int:
    return value


+def _resolve_model_for_modality(
+    model: Optional[str],
+    *,
+    modality: str,
+    explicit_model: bool,
+) -> str:
+    """Pick the xAI video model for a request based on its modality.
+
+    A configured (non-explicit) model is treated as a default, not as a
+    per-request override. ``grok-imagine-video-1.5-preview`` currently
+    rejects text-only video generation, but it is the desired
+    image-to-video backend. An explicit tool ``model=`` still wins for
+    users who intentionally request another model.
+
+    Args:
+        model: Requested or configured model id; may be ``None`` or blank.
+        modality: ``"image"`` for image-to-video; any other value is
+            handled as text-to-video.
+        explicit_model: ``True`` when ``model`` was passed explicitly by the
+            caller rather than coming from configuration/defaults.
+
+    Returns:
+        The model id to send to xAI.
+    """
+    requested = (model or "").strip()
+    if explicit_model and requested:
+        return requested
+    if modality == "image":
+        return DEFAULT_IMAGE_TO_VIDEO_MODEL
+    # The image-only model is also reachable through the aliases declared in
+    # ``_MODELS``; a configured alias must not be sent a text-only request.
+    image_only_ids = {
+        DEFAULT_IMAGE_TO_VIDEO_MODEL,
+        *_MODELS.get(DEFAULT_IMAGE_TO_VIDEO_MODEL, {}).get("aliases", ()),
+    }
+    if requested in image_only_ids:
+        return DEFAULT_TEXT_TO_VIDEO_MODEL
+    return requested or DEFAULT_TEXT_TO_VIDEO_MODEL
+
+
 async def _submit(
    client: httpx.AsyncClient,
    payload: Dict[str, Any],
@ -192,7 +248,7 @@ async def _poll(


 class XAIVideoGenProvider(VideoGenProvider):
-    """xAI grok-imagine-video backend (text-to-video + image-to-video)."""
+    """xAI Grok Imagine video backend (text-to-video + image-to-video)."""

    @property
    def name(self) -> str:
@ -222,7 +278,7 @@ class XAIVideoGenProvider(VideoGenProvider):
        return {
            "name": "xAI Grok Imagine",
            "badge": "paid",
-            "tag": "grok-imagine-video — text-to-video & image-to-video; uses xAI Grok OAuth or XAI_API_KEY",
+            "tag": "grok-imagine-video for text-to-video; grok-imagine-video-1.5-preview for image-to-video; uses xAI Grok OAuth or XAI_API_KEY",
            "env_vars": [],
            "post_setup": "xai_grok",
        }
@ -260,6 +316,7 @@ class XAIVideoGenProvider(VideoGenProvider):
                return loop.run_until_complete(self._generate_async(
                    prompt=prompt,
                    model=model,
+                    explicit_model=bool(kwargs.get("_model_override_explicit")),
                    image_url=image_url,
                    reference_image_urls=reference_image_urls,
                    duration=duration,
@ -284,6 +341,7 @@ class XAIVideoGenProvider(VideoGenProvider):
        *,
        prompt: str,
        model: Optional[str],
+        explicit_model: bool,
        image_url: Optional[str],
        reference_image_urls: Optional[List[str]],
        duration: Optional[int],
@ -303,10 +361,15 @@ class XAIVideoGenProvider(VideoGenProvider):
            )

        prompt = (prompt or "").strip()
-        image_url_norm = (image_url or "").strip() or None
+        image_url_norm = _image_ref_to_xai_url(image_url or "") or None
        normalized_aspect_ratio = (aspect_ratio or DEFAULT_ASPECT_RATIO).strip()
        normalized_resolution = (resolution or DEFAULT_RESOLUTION).strip().lower()
        modality_used = "image" if image_url_norm else "text"
+        resolved_model = _resolve_model_for_modality(
+            model,
+            modality=modality_used,
+            explicit_model=explicit_model,
+        )

        if not prompt:
            return error_response(
@ -340,7 +403,7 @@ class XAIVideoGenProvider(VideoGenProvider):
            normalized_resolution = DEFAULT_RESOLUTION

        payload: Dict[str, Any] = {
-            "model": model or DEFAULT_MODEL,
+            "model": resolved_model,
            "prompt": prompt,
            "duration": clamped_duration,
            "aspect_ratio": normalized_aspect_ratio,
@ -366,7 +429,7 @@ class XAIVideoGenProvider(VideoGenProvider):
                    error=f"xAI submit failed ({exc.response.status_code}): {detail or exc}",
                    error_type="api_error",
                    provider="xai",
-                    model=model or DEFAULT_MODEL,
+                    model=resolved_model,
                    prompt=prompt,
                )

@ -388,7 +451,7 @@ class XAIVideoGenProvider(VideoGenProvider):
                    error="xAI video generation completed without a video URL",
                    error_type="empty_response",
                    provider="xai",
-                    model=body.get("model") or model or DEFAULT_MODEL,
+                    model=body.get("model") or resolved_model,
                    prompt=prompt,
                )
            extra: Dict[str, Any] = {
@ -399,7 +462,7 @@ class XAIVideoGenProvider(VideoGenProvider):
                extra["usage"] = body["usage"]
            return success_response(
                video=url,
-                model=body.get("model") or model or DEFAULT_MODEL,
+                model=body.get("model") or resolved_model,
                prompt=prompt,
                modality=modality_used,
                aspect_ratio=normalized_aspect_ratio,
@ -413,7 +476,7 @@ class XAIVideoGenProvider(VideoGenProvider):
                error=f"Timed out waiting for video generation after {DEFAULT_TIMEOUT_SECONDS}s",
                error_type="timeout",
                provider="xai",
-                model=model or DEFAULT_MODEL,
+                model=resolved_model,
                prompt=prompt,
            )

@ -426,7 +489,7 @@ class XAIVideoGenProvider(VideoGenProvider):
            error=message,
            error_type=f"xai_{status}",
            provider="xai",
-            model=model or DEFAULT_MODEL,
+            model=resolved_model,
            prompt=prompt,
        )

--- a/plugins/video_gen/xai/plugin.yaml
+++ b/plugins/video_gen/xai/plugin.yaml
@ -1,6 +1,6 @@
 name: xai
 version: 1.0.0
-description: "xAI Grok-Imagine video generation backend. Supports text-to-video, image-to-video, reference-image-guided generation, video edit, and video extend via the xAI async videos API."
+description: "xAI Grok Imagine video generation backend. Supports text-to-video, image-to-video, and reference-image-guided generation via the xAI async videos API."
 author: NousResearch
 kind: backend
 requires_env:
--- a/tests/plugins/video_gen/test_xai_plugin.py
+++ b/tests/plugins/video_gen/test_xai_plugin.py
@ -25,6 +25,43 @@ def test_xai_provider_registers():
    assert provider.default_model() == "grok-imagine-video"


+def test_xai_provider_lists_text_and_current_image_video_models():
+    """list_models() returns the text model first, then the image-only
+    1.5 preview model together with its dated alias."""
+    from plugins.video_gen.xai import XAIVideoGenProvider
+
+    models = XAIVideoGenProvider().list_models()
+    ids = [model["id"] for model in models]
+
+    assert ids[0] == "grok-imagine-video"
+    assert ids[1] == "grok-imagine-video-1.5-preview"
+    assert models[1]["modalities"] == ["image"]
+    assert models[1]["aliases"] == ["grok-imagine-video-1.5-2026-05-30"]
+
+
+def test_xai_routes_default_models_by_modality():
+    """Non-explicit requests are routed by modality, while an explicit
+    model is returned exactly as requested."""
+    from plugins.video_gen.xai import _resolve_model_for_modality
+
+    assert _resolve_model_for_modality(
+        "grok-imagine-video",
+        modality="text",
+        explicit_model=False,
+    ) == "grok-imagine-video"
+    assert _resolve_model_for_modality(
+        "grok-imagine-video",
+        modality="image",
+        explicit_model=False,
+    ) == "grok-imagine-video-1.5-preview"
+    assert _resolve_model_for_modality(
+        "grok-imagine-video-1.5-preview",
+        modality="text",
+        explicit_model=False,
+    ) == "grok-imagine-video"
+    assert _resolve_model_for_modality(
+        "grok-imagine-video-1.5-preview",
+        modality="text",
+        explicit_model=True,
+    ) == "grok-imagine-video-1.5-preview"
+
+
 def test_xai_capabilities_text_and_image_only():
    """xAI was previously advertised with edit/extend operations. The
    simplified surface only exposes text-to-video and image-to-video —
--- a/tests/plugins/video_gen/test_xai_plugin_integration.py
+++ b/tests/plugins/video_gen/test_xai_plugin_integration.py
@ -56,7 +56,7 @@ class _FakeAsyncClient:
        return _FakeResponse(200, {
            "status": "done",
            "video": {"url": "https://xai-cdn/out.mp4", "duration": 8},
-            "model": "grok-imagine-video",
+            "model": self.posts[-1]["json"]["model"],
        })


@ -113,6 +113,7 @@ class TestXAIPayload:
        provider, captured = xai_provider
        provider.generate("a dog at sunset")
        payload = _last_post(captured)["json"]
+        assert payload["model"] == "grok-imagine-video"
        assert payload["prompt"] == "a dog at sunset"
        assert "image" not in payload
        assert "reference_images" not in payload
@ -121,8 +122,31 @@ class TestXAIPayload:
        provider, captured = xai_provider
        provider.generate("animate this", image_url="https://example.com/cat.png")
        payload = _last_post(captured)["json"]
+        assert payload["model"] == "grok-imagine-video-1.5-preview"
        assert payload["image"] == {"url": "https://example.com/cat.png"}

+    def test_local_image_path_is_sent_as_data_uri(self, xai_provider, tmp_path):
+        """A local PNG path given as image_url is posted as a base64
+        ``data:image/png`` URI using the image-to-video model."""
+        provider, captured = xai_provider
+        image_path = tmp_path / "frame.png"
+        image_path.write_bytes(b"\x89PNG\r\n\x1a\nfake")
+
+        provider.generate("animate this", image_url=str(image_path))
+
+        payload = _last_post(captured)["json"]
+        assert payload["model"] == "grok-imagine-video-1.5-preview"
+        assert payload["image"]["url"].startswith("data:image/png;base64,")
+
+    def test_explicit_model_override_is_honored_for_image(self, xai_provider):
+        """With ``_model_override_explicit=True`` the requested model is
+        posted even though an image is supplied."""
+        provider, captured = xai_provider
+        provider.generate(
+            "animate this",
+            image_url="https://example.com/cat.png",
+            model="grok-imagine-video",
+            _model_override_explicit=True,
+        )
+        payload = _last_post(captured)["json"]
+        assert payload["model"] == "grok-imagine-video"
+
    def test_reference_images_payload(self, xai_provider):
        provider, captured = xai_provider
        provider.generate(
--- a/tests/tools/test_video_generation_tool_surface_matrix.py
+++ b/tests/tools/test_video_generation_tool_surface_matrix.py
@ -82,7 +82,7 @@ def matrix_env(tmp_path, monkeypatch):
            return _Resp({
                "status": "done",
                "video": {"url": "https://xai-cdn/out.mp4", "duration": 8},
-                "model": "grok-imagine-video",
+                "model": xai_calls[-1]["json"].get("model", "grok-imagine-video"),
            })
    import plugins.video_gen.xai as xai_plugin
    monkeypatch.setattr(xai_plugin.httpx, "AsyncClient", lambda: _Client())
@ -202,6 +202,7 @@ def test_xai_text_only_via_tool_surface(matrix_env):
    assert len(xai_calls) == 1
    assert xai_calls[0]["url"].endswith("/videos/generations")
    payload = xai_calls[0]["json"] or {}
+    assert payload["model"] == "grok-imagine-video"
    assert "image" not in payload
    assert "reference_images" not in payload

@ -221,6 +222,26 @@ def test_xai_text_plus_image_via_tool_surface(matrix_env):
    assert len(xai_calls) == 1
    assert xai_calls[0]["url"].endswith("/videos/generations")
    payload = xai_calls[0]["json"] or {}
+    assert payload["model"] == "grok-imagine-video-1.5-preview"
+    assert payload["image"] == {"url": "https://example.com/img.png"}
+
+
+def test_xai_explicit_model_override_via_tool_surface(matrix_env):
+    """A ``model`` tool argument is sent to xAI unchanged for an image
+    request made through the tool surface."""
+    home, _, xai_calls = matrix_env
+
+    result = _invoke_tool(
+        home,
+        {"video_gen": {"provider": "xai"}},
+        {
+            "prompt": "animate this",
+            "image_url": "https://example.com/img.png",
+            "model": "grok-imagine-video",
+        },
+    )
+    assert result["success"] is True
+
+    payload = xai_calls[0]["json"] or {}
+    assert payload["model"] == "grok-imagine-video"
    assert payload["image"] == {"url": "https://example.com/img.png"}


--- a/tools/video_generation_tool.py
+++ b/tools/video_generation_tool.py
@ -336,6 +336,7 @@ def _handle_video_generate(args: Dict[str, Any], **_kw: Any) -> str:

    kwargs: Dict[str, Any] = {
        "model": model,
+        "_model_override_explicit": bool(model_override),
        "image_url": image_url,
        "reference_image_urls": reference_image_urls,
        "duration": duration,
--- a/website/docs/guides/xai-grok-oauth.md
+++ b/website/docs/guides/xai-grok-oauth.md
@ -182,7 +182,8 @@ The `x_search` toolset auto-enables whenever xAI credentials (a SuperGrok / X Pr
 | Chat | `grok-4.20-multi-agent-0309` | Multi-agent variant |
 | Image | `grok-imagine-image` | Default; ~5–10 s |
 | Image | `grok-imagine-image-quality` | Higher fidelity; ~10–20 s |
-| Video | `grok-imagine-video` | Text-to-video and image-to-video; up to 7 reference images |
+| Video | `grok-imagine-video` | Text-to-video |
+| Video | `grok-imagine-video-1.5-preview` | Image-to-video; dated alias `grok-imagine-video-1.5-2026-05-30` |
 | TTS | (default voice) | xAI `/v1/tts` endpoint |

 The chat catalog is derived live from the on-disk `models.dev` cache; new xAI releases appear automatically once that cache refreshes. `grok-4.3` is always pinned to the top of the list.
--- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/guides/xai-grok-oauth.md
+++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/guides/xai-grok-oauth.md
@ -180,7 +180,8 @@ hermes tools
 | 对话 | `grok-4.20-multi-agent-0309` | 多 agent 变体 |
 | 图像 | `grok-imagine-image` | 默认；约 5–10 秒 |
 | 图像 | `grok-imagine-image-quality` | 更高保真度；约 10–20 秒 |
-| 视频 | `grok-imagine-video` | 文本转视频和图像转视频；最多 7 张参考图像 |
+| 视频 | `grok-imagine-video` | 文本转视频 |
+| 视频 | `grok-imagine-video-1.5-preview` | 图像转视频；日期别名 `grok-imagine-video-1.5-2026-05-30` |
 | TTS | （默认音色） | xAI `/v1/tts` 端点 |

 对话模型目录从磁盘上的 `models.dev` 缓存实时获取；缓存刷新后，新的 xAI 模型会自动出现。`grok-4.3` 始终固定在列表顶部。