fix(xai): route video models by modality
This commit is contained in:
@ -21,9 +21,12 @@ delivers it.
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import httpx
|
||||
@ -42,7 +45,9 @@ logger = logging.getLogger(__name__)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
|
||||
DEFAULT_MODEL = "grok-imagine-video"
|
||||
DEFAULT_TEXT_TO_VIDEO_MODEL = "grok-imagine-video"
|
||||
DEFAULT_IMAGE_TO_VIDEO_MODEL = "grok-imagine-video-1.5-preview"
|
||||
DEFAULT_MODEL = DEFAULT_TEXT_TO_VIDEO_MODEL
|
||||
DEFAULT_DURATION = 8
|
||||
DEFAULT_ASPECT_RATIO = "16:9"
|
||||
DEFAULT_RESOLUTION = "720p"
|
||||
@ -58,10 +63,18 @@ _MODELS: Dict[str, Dict[str, Any]] = {
|
||||
"grok-imagine-video": {
|
||||
"display": "Grok Imagine Video",
|
||||
"speed": "~60-240s",
|
||||
"strengths": "Text-to-video + image-to-video; up to 7 reference images for style/character.",
|
||||
"price": "see https://docs.x.ai/docs/models",
|
||||
"strengths": "Text-to-video; legacy image-to-video fallback.",
|
||||
"price": "see https://docs.x.ai/developers/models/grok-imagine-video",
|
||||
"modalities": ["text", "image"],
|
||||
},
|
||||
"grok-imagine-video-1.5-preview": {
|
||||
"display": "Grok Imagine Video 1.5 Preview",
|
||||
"speed": "~60-240s",
|
||||
"strengths": "Latest xAI image-to-video model.",
|
||||
"price": "see https://docs.x.ai/developers/models/grok-imagine-video-1.5-preview",
|
||||
"modalities": ["image"],
|
||||
"aliases": ["grok-imagine-video-1.5-2026-05-30"],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@ -111,10 +124,31 @@ def _xai_headers(api_key: str) -> Dict[str, str]:
|
||||
}
|
||||
|
||||
|
||||
def _image_ref_to_xai_url(value: str) -> str:
    """Return a URL/data URI accepted by xAI for image inputs.

    Args:
        value: An ``http(s)://`` URL, a ``data:image/...`` URI, or a local
            filesystem path (a leading ``~`` is expanded). ``None`` and
            blank strings are tolerated.

    Returns:
        ``""`` for blank input. Remote URLs and image data URIs are returned
        stripped but otherwise unchanged. A readable local file whose guessed
        MIME type is ``image/*`` is returned inlined as a
        ``data:<mime>;base64,...`` URI. Anything else (missing file, non-image
        file, path that cannot be resolved or read) is returned as the
        stripped input.
    """
    ref = (value or "").strip()
    if not ref:
        return ""
    if ref.lower().startswith(("http://", "https://", "data:image/")):
        return ref

    try:
        # expanduser() raises RuntimeError for "~unknownuser/..." and the
        # filesystem calls can raise OSError (e.g. over-long names, unreadable
        # files). Such refs are passed through untouched instead of crashing.
        path = Path(ref).expanduser()
        if not path.is_file():
            return ref
        mime = mimetypes.guess_type(path.name)[0] or "application/octet-stream"
        if not mime.startswith("image/"):
            return ref
        raw = path.read_bytes()
    except (OSError, RuntimeError, ValueError) as exc:
        logging.getLogger(__name__).debug(
            "Could not inline local image reference; passing it through: %s", exc
        )
        return ref

    encoded = base64.b64encode(raw).decode("ascii")
    return f"data:{mime};base64,{encoded}"
|
||||
|
||||
|
||||
def _normalize_reference_images(
    reference_image_urls: Optional[List[str]],
) -> Optional[List[Dict[str, str]]]:
    """Convert reference image refs into ``{"url": ...}`` entries.

    Each entry is passed through ``_image_ref_to_xai_url``; entries that come
    back empty (blank or ``None`` inputs) are dropped.

    Args:
        reference_image_urls: Reference image refs, or ``None``.

    Returns:
        A list of ``{"url": <ref>}`` dicts in input order, or ``None`` when
        nothing usable remains.
    """
    converted = (_image_ref_to_xai_url(url) for url in reference_image_urls or [])
    refs = [{"url": ref} for ref in converted if ref]
    return refs or None
|
||||
@ -131,6 +165,28 @@ def _clamp_duration(duration: Optional[int], has_reference_images: bool) -> int:
|
||||
return value
|
||||
|
||||
|
||||
def _resolve_model_for_modality(
    model: Optional[str],
    *,
    modality: str,
    explicit_model: bool,
) -> str:
    """Pick the xAI video model for a request of the given modality.

    ``grok-imagine-video-1.5-preview`` currently rejects text-only video
    generation, but it is the desired image-to-video backend. A configured
    model is therefore treated as a default rather than an override, while
    an explicit tool-level ``model=`` is always honored as given.

    Args:
        model: Requested model name; ``None`` or blank means "no preference".
        modality: ``"image"`` selects the image-to-video default; any other
            value is handled as text-to-video.
        explicit_model: True when ``model`` is an explicit per-call override.

    Returns:
        The model identifier to use for the request.
    """
    candidate = model.strip() if model else ""

    # An explicit, non-blank override wins regardless of modality.
    if candidate and explicit_model:
        return candidate

    if modality == "image":
        return DEFAULT_IMAGE_TO_VIDEO_MODEL

    # Non-image path: never route to the image-only default model.
    if not candidate or candidate == DEFAULT_IMAGE_TO_VIDEO_MODEL:
        return DEFAULT_TEXT_TO_VIDEO_MODEL
    return candidate
|
||||
|
||||
|
||||
async def _submit(
|
||||
client: httpx.AsyncClient,
|
||||
payload: Dict[str, Any],
|
||||
@ -192,7 +248,7 @@ async def _poll(
|
||||
|
||||
|
||||
class XAIVideoGenProvider(VideoGenProvider):
|
||||
"""xAI grok-imagine-video backend (text-to-video + image-to-video)."""
|
||||
"""xAI Grok Imagine video backend (text-to-video + image-to-video)."""
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
@ -222,7 +278,7 @@ class XAIVideoGenProvider(VideoGenProvider):
|
||||
return {
|
||||
"name": "xAI Grok Imagine",
|
||||
"badge": "paid",
|
||||
"tag": "grok-imagine-video — text-to-video & image-to-video; uses xAI Grok OAuth or XAI_API_KEY",
|
||||
"tag": "grok-imagine-video for text-to-video; grok-imagine-video-1.5-preview for image-to-video; uses xAI Grok OAuth or XAI_API_KEY",
|
||||
"env_vars": [],
|
||||
"post_setup": "xai_grok",
|
||||
}
|
||||
@ -260,6 +316,7 @@ class XAIVideoGenProvider(VideoGenProvider):
|
||||
return loop.run_until_complete(self._generate_async(
|
||||
prompt=prompt,
|
||||
model=model,
|
||||
explicit_model=bool(kwargs.get("_model_override_explicit")),
|
||||
image_url=image_url,
|
||||
reference_image_urls=reference_image_urls,
|
||||
duration=duration,
|
||||
@ -284,6 +341,7 @@ class XAIVideoGenProvider(VideoGenProvider):
|
||||
*,
|
||||
prompt: str,
|
||||
model: Optional[str],
|
||||
explicit_model: bool,
|
||||
image_url: Optional[str],
|
||||
reference_image_urls: Optional[List[str]],
|
||||
duration: Optional[int],
|
||||
@ -303,10 +361,15 @@ class XAIVideoGenProvider(VideoGenProvider):
|
||||
)
|
||||
|
||||
prompt = (prompt or "").strip()
|
||||
image_url_norm = (image_url or "").strip() or None
|
||||
image_url_norm = _image_ref_to_xai_url(image_url or "") or None
|
||||
normalized_aspect_ratio = (aspect_ratio or DEFAULT_ASPECT_RATIO).strip()
|
||||
normalized_resolution = (resolution or DEFAULT_RESOLUTION).strip().lower()
|
||||
modality_used = "image" if image_url_norm else "text"
|
||||
resolved_model = _resolve_model_for_modality(
|
||||
model,
|
||||
modality=modality_used,
|
||||
explicit_model=explicit_model,
|
||||
)
|
||||
|
||||
if not prompt:
|
||||
return error_response(
|
||||
@ -340,7 +403,7 @@ class XAIVideoGenProvider(VideoGenProvider):
|
||||
normalized_resolution = DEFAULT_RESOLUTION
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"model": model or DEFAULT_MODEL,
|
||||
"model": resolved_model,
|
||||
"prompt": prompt,
|
||||
"duration": clamped_duration,
|
||||
"aspect_ratio": normalized_aspect_ratio,
|
||||
@ -366,7 +429,7 @@ class XAIVideoGenProvider(VideoGenProvider):
|
||||
error=f"xAI submit failed ({exc.response.status_code}): {detail or exc}",
|
||||
error_type="api_error",
|
||||
provider="xai",
|
||||
model=model or DEFAULT_MODEL,
|
||||
model=resolved_model,
|
||||
prompt=prompt,
|
||||
)
|
||||
|
||||
@ -388,7 +451,7 @@ class XAIVideoGenProvider(VideoGenProvider):
|
||||
error="xAI video generation completed without a video URL",
|
||||
error_type="empty_response",
|
||||
provider="xai",
|
||||
model=body.get("model") or model or DEFAULT_MODEL,
|
||||
model=body.get("model") or resolved_model,
|
||||
prompt=prompt,
|
||||
)
|
||||
extra: Dict[str, Any] = {
|
||||
@ -399,7 +462,7 @@ class XAIVideoGenProvider(VideoGenProvider):
|
||||
extra["usage"] = body["usage"]
|
||||
return success_response(
|
||||
video=url,
|
||||
model=body.get("model") or model or DEFAULT_MODEL,
|
||||
model=body.get("model") or resolved_model,
|
||||
prompt=prompt,
|
||||
modality=modality_used,
|
||||
aspect_ratio=normalized_aspect_ratio,
|
||||
@ -413,7 +476,7 @@ class XAIVideoGenProvider(VideoGenProvider):
|
||||
error=f"Timed out waiting for video generation after {DEFAULT_TIMEOUT_SECONDS}s",
|
||||
error_type="timeout",
|
||||
provider="xai",
|
||||
model=model or DEFAULT_MODEL,
|
||||
model=resolved_model,
|
||||
prompt=prompt,
|
||||
)
|
||||
|
||||
@ -426,7 +489,7 @@ class XAIVideoGenProvider(VideoGenProvider):
|
||||
error=message,
|
||||
error_type=f"xai_{status}",
|
||||
provider="xai",
|
||||
model=model or DEFAULT_MODEL,
|
||||
model=resolved_model,
|
||||
prompt=prompt,
|
||||
)
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
name: xai
|
||||
version: 1.0.0
|
||||
description: "xAI Grok-Imagine video generation backend. Supports text-to-video, image-to-video, reference-image-guided generation, video edit, and video extend via the xAI async videos API."
|
||||
description: "xAI Grok Imagine video generation backend. Supports text-to-video, image-to-video, and reference-image-guided generation via the xAI async videos API."
|
||||
author: NousResearch
|
||||
kind: backend
|
||||
requires_env:
|
||||
|
||||
@ -25,6 +25,43 @@ def test_xai_provider_registers():
|
||||
assert provider.default_model() == "grok-imagine-video"
|
||||
|
||||
|
||||
def test_xai_provider_lists_text_and_current_image_video_models():
    """The first two listed models are the text model and the 1.5 preview."""
    from plugins.video_gen.xai import XAIVideoGenProvider

    catalog = XAIVideoGenProvider().list_models()
    model_ids = [entry["id"] for entry in catalog]

    assert model_ids[0] == "grok-imagine-video"
    assert model_ids[1] == "grok-imagine-video-1.5-preview"

    # The preview entry is image-only and carries its dated alias.
    preview_entry = catalog[1]
    assert preview_entry["modalities"] == ["image"]
    assert preview_entry["aliases"] == ["grok-imagine-video-1.5-2026-05-30"]
|
||||
|
||||
|
||||
def test_xai_routes_default_models_by_modality():
    """Non-explicit models are routed by modality; explicit ones pass through."""
    from plugins.video_gen.xai import _resolve_model_for_modality

    # (requested model, modality, explicit override?, expected result)
    cases = [
        ("grok-imagine-video", "text", False, "grok-imagine-video"),
        ("grok-imagine-video", "image", False, "grok-imagine-video-1.5-preview"),
        ("grok-imagine-video-1.5-preview", "text", False, "grok-imagine-video"),
        (
            "grok-imagine-video-1.5-preview",
            "text",
            True,
            "grok-imagine-video-1.5-preview",
        ),
    ]

    for requested, modality, explicit, expected in cases:
        resolved = _resolve_model_for_modality(
            requested,
            modality=modality,
            explicit_model=explicit,
        )
        assert resolved == expected
|
||||
|
||||
|
||||
def test_xai_capabilities_text_and_image_only():
|
||||
"""xAI was previously advertised with edit/extend operations. The
|
||||
simplified surface only exposes text-to-video and image-to-video —
|
||||
|
||||
@ -56,7 +56,7 @@ class _FakeAsyncClient:
|
||||
return _FakeResponse(200, {
|
||||
"status": "done",
|
||||
"video": {"url": "https://xai-cdn/out.mp4", "duration": 8},
|
||||
"model": "grok-imagine-video",
|
||||
"model": self.posts[-1]["json"]["model"],
|
||||
})
|
||||
|
||||
|
||||
@ -113,6 +113,7 @@ class TestXAIPayload:
|
||||
provider, captured = xai_provider
|
||||
provider.generate("a dog at sunset")
|
||||
payload = _last_post(captured)["json"]
|
||||
assert payload["model"] == "grok-imagine-video"
|
||||
assert payload["prompt"] == "a dog at sunset"
|
||||
assert "image" not in payload
|
||||
assert "reference_images" not in payload
|
||||
@ -121,8 +122,31 @@ class TestXAIPayload:
|
||||
provider, captured = xai_provider
|
||||
provider.generate("animate this", image_url="https://example.com/cat.png")
|
||||
payload = _last_post(captured)["json"]
|
||||
assert payload["model"] == "grok-imagine-video-1.5-preview"
|
||||
assert payload["image"] == {"url": "https://example.com/cat.png"}
|
||||
|
||||
def test_local_image_path_is_sent_as_data_uri(self, xai_provider, tmp_path):
    """A local PNG path passed as ``image_url`` is submitted as a data URI."""
    provider, captured = xai_provider

    # Write a tiny file with a PNG signature under the pytest temp directory.
    local_png = tmp_path / "frame.png"
    local_png.write_bytes(b"\x89PNG\r\n\x1a\nfake")

    provider.generate("animate this", image_url=str(local_png))

    sent = _last_post(captured)["json"]
    assert sent["model"] == "grok-imagine-video-1.5-preview"
    submitted_url = sent["image"]["url"]
    assert submitted_url.startswith("data:image/png;base64,")
|
||||
|
||||
def test_explicit_model_override_is_honored_for_image(self, xai_provider):
    """An explicit ``model=`` is sent as-is even when an image is supplied."""
    provider, captured = xai_provider

    call_kwargs = {
        "image_url": "https://example.com/cat.png",
        "model": "grok-imagine-video",
        "_model_override_explicit": True,
    }
    provider.generate("animate this", **call_kwargs)

    sent = _last_post(captured)["json"]
    assert sent["model"] == "grok-imagine-video"
|
||||
|
||||
def test_reference_images_payload(self, xai_provider):
|
||||
provider, captured = xai_provider
|
||||
provider.generate(
|
||||
|
||||
@ -82,7 +82,7 @@ def matrix_env(tmp_path, monkeypatch):
|
||||
return _Resp({
|
||||
"status": "done",
|
||||
"video": {"url": "https://xai-cdn/out.mp4", "duration": 8},
|
||||
"model": "grok-imagine-video",
|
||||
"model": xai_calls[-1]["json"].get("model", "grok-imagine-video"),
|
||||
})
|
||||
import plugins.video_gen.xai as xai_plugin
|
||||
monkeypatch.setattr(xai_plugin.httpx, "AsyncClient", lambda: _Client())
|
||||
@ -202,6 +202,7 @@ def test_xai_text_only_via_tool_surface(matrix_env):
|
||||
assert len(xai_calls) == 1
|
||||
assert xai_calls[0]["url"].endswith("/videos/generations")
|
||||
payload = xai_calls[0]["json"] or {}
|
||||
assert payload["model"] == "grok-imagine-video"
|
||||
assert "image" not in payload
|
||||
assert "reference_images" not in payload
|
||||
|
||||
@ -221,6 +222,26 @@ def test_xai_text_plus_image_via_tool_surface(matrix_env):
|
||||
assert len(xai_calls) == 1
|
||||
assert xai_calls[0]["url"].endswith("/videos/generations")
|
||||
payload = xai_calls[0]["json"] or {}
|
||||
assert payload["model"] == "grok-imagine-video-1.5-preview"
|
||||
assert payload["image"] == {"url": "https://example.com/img.png"}
|
||||
|
||||
|
||||
def test_xai_explicit_model_override_via_tool_surface(matrix_env):
    """A ``model`` tool argument reaches the xAI payload unchanged with an image."""
    home, _, xai_calls = matrix_env

    config = {"video_gen": {"provider": "xai"}}
    tool_args = {
        "prompt": "animate this",
        "image_url": "https://example.com/img.png",
        "model": "grok-imagine-video",
    }
    result = _invoke_tool(home, config, tool_args)
    assert result["success"] is True

    sent = xai_calls[0]["json"] or {}
    assert sent["model"] == "grok-imagine-video"
    assert sent["image"] == {"url": "https://example.com/img.png"}
|
||||
|
||||
|
||||
|
||||
@ -336,6 +336,7 @@ def _handle_video_generate(args: Dict[str, Any], **_kw: Any) -> str:
|
||||
|
||||
kwargs: Dict[str, Any] = {
|
||||
"model": model,
|
||||
"_model_override_explicit": bool(model_override),
|
||||
"image_url": image_url,
|
||||
"reference_image_urls": reference_image_urls,
|
||||
"duration": duration,
|
||||
|
||||
@ -182,7 +182,8 @@ The `x_search` toolset auto-enables whenever xAI credentials (a SuperGrok / X Pr
|
||||
| Chat | `grok-4.20-multi-agent-0309` | Multi-agent variant |
|
||||
| Image | `grok-imagine-image` | Default; ~5–10 s |
|
||||
| Image | `grok-imagine-image-quality` | Higher fidelity; ~10–20 s |
|
||||
| Video | `grok-imagine-video` | Text-to-video and image-to-video; up to 7 reference images |
|
||||
| Video | `grok-imagine-video` | Text-to-video |
|
||||
| Video | `grok-imagine-video-1.5-preview` | Image-to-video; dated alias `grok-imagine-video-1.5-2026-05-30` |
|
||||
| TTS | (default voice) | xAI `/v1/tts` endpoint |
|
||||
|
||||
The chat catalog is derived live from the on-disk `models.dev` cache; new xAI releases appear automatically once that cache refreshes. `grok-4.3` is always pinned to the top of the list.
|
||||
|
||||
@ -180,7 +180,8 @@ hermes tools
|
||||
| 对话 | `grok-4.20-multi-agent-0309` | 多 agent 变体 |
|
||||
| 图像 | `grok-imagine-image` | 默认;约 5–10 秒 |
|
||||
| 图像 | `grok-imagine-image-quality` | 更高保真度;约 10–20 秒 |
|
||||
| 视频 | `grok-imagine-video` | 文本转视频和图像转视频;最多 7 张参考图像 |
|
||||
| 视频 | `grok-imagine-video` | 文本转视频 |
|
||||
| 视频 | `grok-imagine-video-1.5-preview` | 图像转视频;日期别名 `grok-imagine-video-1.5-2026-05-30` |
|
||||
| TTS | (默认音色) | xAI `/v1/tts` 端点 |
|
||||
|
||||
对话模型目录从磁盘上的 `models.dev` 缓存实时获取;缓存刷新后,新的 xAI 模型会自动出现。`grok-4.3` 始终固定在列表顶部。
|
||||
@ -266,4 +267,4 @@ hermes auth logout xai-oauth
|
||||
- [AI Providers 参考](../integrations/providers.md)
|
||||
- [环境变量](../reference/environment-variables.md)
|
||||
- [配置](../user-guide/configuration.md)
|
||||
- [语音与 TTS](../user-guide/features/tts.md)
|
||||
- [语音与 TTS](../user-guide/features/tts.md)
|
||||
|
||||
Reference in New Issue
Block a user