diff --git a/plugins/video_gen/xai/__init__.py b/plugins/video_gen/xai/__init__.py index 869c2feef..308837b61 100644 --- a/plugins/video_gen/xai/__init__.py +++ b/plugins/video_gen/xai/__init__.py @@ -21,9 +21,12 @@ delivers it. from __future__ import annotations import asyncio +import base64 import logging +import mimetypes import os import uuid +from pathlib import Path from typing import Any, Dict, List, Optional, Tuple import httpx @@ -42,7 +45,9 @@ logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1" -DEFAULT_MODEL = "grok-imagine-video" +DEFAULT_TEXT_TO_VIDEO_MODEL = "grok-imagine-video" +DEFAULT_IMAGE_TO_VIDEO_MODEL = "grok-imagine-video-1.5-preview" +DEFAULT_MODEL = DEFAULT_TEXT_TO_VIDEO_MODEL DEFAULT_DURATION = 8 DEFAULT_ASPECT_RATIO = "16:9" DEFAULT_RESOLUTION = "720p" @@ -58,10 +63,18 @@ _MODELS: Dict[str, Dict[str, Any]] = { "grok-imagine-video": { "display": "Grok Imagine Video", "speed": "~60-240s", - "strengths": "Text-to-video + image-to-video; up to 7 reference images for style/character.", - "price": "see https://docs.x.ai/docs/models", + "strengths": "Text-to-video; legacy image-to-video fallback.", + "price": "see https://docs.x.ai/developers/models/grok-imagine-video", "modalities": ["text", "image"], }, + "grok-imagine-video-1.5-preview": { + "display": "Grok Imagine Video 1.5 Preview", + "speed": "~60-240s", + "strengths": "Latest xAI image-to-video model.", + "price": "see https://docs.x.ai/developers/models/grok-imagine-video-1.5-preview", + "modalities": ["image"], + "aliases": ["grok-imagine-video-1.5-2026-05-30"], + }, } @@ -111,10 +124,31 @@ def _xai_headers(api_key: str) -> Dict[str, str]: } +def _image_ref_to_xai_url(value: str) -> str: + """Return a URL/data URI accepted by xAI for image inputs.""" + ref = (value or "").strip() + if not ref: + return "" + lower = ref.lower() + if lower.startswith(("http://", "https://", "data:image/")): + return ref + + path = Path(ref).expanduser() + if not path.is_file(): + return ref + + mime = mimetypes.guess_type(path.name)[0] or "application/octet-stream" + if not mime.startswith("image/"): + return ref + + encoded = base64.b64encode(path.read_bytes()).decode("ascii") + return f"data:{mime};base64,{encoded}" + + def _normalize_reference_images(reference_image_urls: Optional[List[str]]): refs = [] for url in reference_image_urls or []: - normalized = (url or "").strip() + normalized = _image_ref_to_xai_url(url) if normalized: refs.append({"url": normalized}) return refs or None @@ -131,6 +165,28 @@ def _clamp_duration(duration: Optional[int], has_reference_images: bool) -> int: return value +def _resolve_model_for_modality( + model: Optional[str], + *, + modality: str, + explicit_model: bool, +) -> str: + """Select xAI's text/video model without treating config as a prompt override. + + ``grok-imagine-video-1.5-preview`` currently rejects text-only video + generation, but it is the desired image-to-video backend. Explicit tool + ``model=`` still wins for users who intentionally request another model. + """ + requested = (model or "").strip() + if explicit_model and requested: + return requested + if modality == "image": + return DEFAULT_IMAGE_TO_VIDEO_MODEL + if requested == DEFAULT_IMAGE_TO_VIDEO_MODEL: + return DEFAULT_TEXT_TO_VIDEO_MODEL + return requested or DEFAULT_TEXT_TO_VIDEO_MODEL + + async def _submit( client: httpx.AsyncClient, payload: Dict[str, Any], @@ -192,7 +248,7 @@ async def _poll( class XAIVideoGenProvider(VideoGenProvider): - """xAI grok-imagine-video backend (text-to-video + image-to-video).""" + """xAI Grok Imagine video backend (text-to-video + image-to-video).""" @property def name(self) -> str: @@ -222,7 +278,7 @@ class XAIVideoGenProvider(VideoGenProvider): return { "name": "xAI Grok Imagine", "badge": "paid", - "tag": "grok-imagine-video — text-to-video & image-to-video; uses xAI Grok OAuth or XAI_API_KEY", + "tag": "grok-imagine-video for text-to-video; grok-imagine-video-1.5-preview for image-to-video; uses xAI Grok OAuth or XAI_API_KEY", "env_vars": [], "post_setup": "xai_grok", } @@ -260,6 +316,7 @@ class XAIVideoGenProvider(VideoGenProvider): return loop.run_until_complete(self._generate_async( prompt=prompt, model=model, + explicit_model=bool(kwargs.get("_model_override_explicit")), image_url=image_url, reference_image_urls=reference_image_urls, duration=duration, @@ -284,6 +341,7 @@ class XAIVideoGenProvider(VideoGenProvider): *, prompt: str, model: Optional[str], + explicit_model: bool, image_url: Optional[str], reference_image_urls: Optional[List[str]], duration: Optional[int], @@ -303,10 +361,15 @@ class XAIVideoGenProvider(VideoGenProvider): ) prompt = (prompt or "").strip() - image_url_norm = (image_url or "").strip() or None + image_url_norm = _image_ref_to_xai_url(image_url or "") or None normalized_aspect_ratio = (aspect_ratio or DEFAULT_ASPECT_RATIO).strip() normalized_resolution = (resolution or DEFAULT_RESOLUTION).strip().lower() modality_used = "image" if image_url_norm else "text" + resolved_model = _resolve_model_for_modality( + model, + modality=modality_used, + explicit_model=explicit_model, + ) if not prompt: return error_response( @@ -340,7 +403,7 @@ class XAIVideoGenProvider(VideoGenProvider): normalized_resolution = DEFAULT_RESOLUTION payload: Dict[str, Any] = { - "model": model or DEFAULT_MODEL, + "model": resolved_model, "prompt": prompt, "duration": clamped_duration, "aspect_ratio": normalized_aspect_ratio, @@ -366,7 +429,7 @@ class XAIVideoGenProvider(VideoGenProvider): error=f"xAI submit failed ({exc.response.status_code}): {detail or exc}", error_type="api_error", provider="xai", - model=model or DEFAULT_MODEL, + model=resolved_model, prompt=prompt, ) @@ -388,7 +451,7 @@ class XAIVideoGenProvider(VideoGenProvider): error="xAI video generation completed without a video URL", error_type="empty_response", provider="xai", - model=body.get("model") or model or DEFAULT_MODEL, + model=body.get("model") or resolved_model, prompt=prompt, ) extra: Dict[str, Any] = { @@ -399,7 +462,7 @@ class XAIVideoGenProvider(VideoGenProvider): extra["usage"] = body["usage"] return success_response( video=url, - model=body.get("model") or model or DEFAULT_MODEL, + model=body.get("model") or resolved_model, prompt=prompt, modality=modality_used, aspect_ratio=normalized_aspect_ratio, @@ -413,7 +476,7 @@ class XAIVideoGenProvider(VideoGenProvider): error=f"Timed out waiting for video generation after {DEFAULT_TIMEOUT_SECONDS}s", error_type="timeout", provider="xai", - model=model or DEFAULT_MODEL, + model=resolved_model, prompt=prompt, ) @@ -426,7 +489,7 @@ class XAIVideoGenProvider(VideoGenProvider): error=message, error_type=f"xai_{status}", provider="xai", - model=model or DEFAULT_MODEL, + model=resolved_model, prompt=prompt, ) diff --git a/plugins/video_gen/xai/plugin.yaml b/plugins/video_gen/xai/plugin.yaml index 85aa6e68f..5e3e8b1ac 100644 --- a/plugins/video_gen/xai/plugin.yaml +++ b/plugins/video_gen/xai/plugin.yaml @@ -1,6 +1,6 @@ name: xai version: 1.0.0 -description: "xAI Grok-Imagine video generation backend. Supports text-to-video, image-to-video, reference-image-guided generation, video edit, and video extend via the xAI async videos API." +description: "xAI Grok Imagine video generation backend. Supports text-to-video, image-to-video, and reference-image-guided generation via the xAI async videos API." author: NousResearch kind: backend requires_env: diff --git a/tests/plugins/video_gen/test_xai_plugin.py b/tests/plugins/video_gen/test_xai_plugin.py index 4c365020a..3df9086af 100644 --- a/tests/plugins/video_gen/test_xai_plugin.py +++ b/tests/plugins/video_gen/test_xai_plugin.py @@ -25,6 +25,43 @@ def test_xai_provider_registers(): assert provider.default_model() == "grok-imagine-video" +def test_xai_provider_lists_text_and_current_image_video_models(): + from plugins.video_gen.xai import XAIVideoGenProvider + + models = XAIVideoGenProvider().list_models() + ids = [model["id"] for model in models] + + assert ids[0] == "grok-imagine-video" + assert ids[1] == "grok-imagine-video-1.5-preview" + assert models[1]["modalities"] == ["image"] + assert models[1]["aliases"] == ["grok-imagine-video-1.5-2026-05-30"] + + +def test_xai_routes_default_models_by_modality(): + from plugins.video_gen.xai import _resolve_model_for_modality + + assert _resolve_model_for_modality( + "grok-imagine-video", + modality="text", + explicit_model=False, + ) == "grok-imagine-video" + assert _resolve_model_for_modality( + "grok-imagine-video", + modality="image", + explicit_model=False, + ) == "grok-imagine-video-1.5-preview" + assert _resolve_model_for_modality( + "grok-imagine-video-1.5-preview", + modality="text", + explicit_model=False, + ) == "grok-imagine-video" + assert _resolve_model_for_modality( + "grok-imagine-video-1.5-preview", + modality="text", + explicit_model=True, + ) == "grok-imagine-video-1.5-preview" + + def test_xai_capabilities_text_and_image_only(): """xAI was previously advertised with edit/extend operations. The simplified surface only exposes text-to-video and image-to-video — diff --git a/tests/plugins/video_gen/test_xai_plugin_integration.py b/tests/plugins/video_gen/test_xai_plugin_integration.py index 31d44f15b..22693d763 100644 --- a/tests/plugins/video_gen/test_xai_plugin_integration.py +++ b/tests/plugins/video_gen/test_xai_plugin_integration.py @@ -56,7 +56,7 @@ class _FakeAsyncClient: return _FakeResponse(200, { "status": "done", "video": {"url": "https://xai-cdn/out.mp4", "duration": 8}, - "model": "grok-imagine-video", + "model": self.posts[-1]["json"]["model"], }) @@ -113,6 +113,7 @@ class TestXAIPayload: provider, captured = xai_provider provider.generate("a dog at sunset") payload = _last_post(captured)["json"] + assert payload["model"] == "grok-imagine-video" assert payload["prompt"] == "a dog at sunset" assert "image" not in payload assert "reference_images" not in payload @@ -121,8 +122,31 @@ class TestXAIPayload: provider, captured = xai_provider provider.generate("animate this", image_url="https://example.com/cat.png") payload = _last_post(captured)["json"] + assert payload["model"] == "grok-imagine-video-1.5-preview" assert payload["image"] == {"url": "https://example.com/cat.png"} + def test_local_image_path_is_sent_as_data_uri(self, xai_provider, tmp_path): + provider, captured = xai_provider + image_path = tmp_path / "frame.png" + image_path.write_bytes(b"\x89PNG\r\n\x1a\nfake") + + provider.generate("animate this", image_url=str(image_path)) + + payload = _last_post(captured)["json"] + assert payload["model"] == "grok-imagine-video-1.5-preview" + assert payload["image"]["url"].startswith("data:image/png;base64,") + + def test_explicit_model_override_is_honored_for_image(self, xai_provider): + provider, captured = xai_provider + provider.generate( + "animate this", + image_url="https://example.com/cat.png", + model="grok-imagine-video", + _model_override_explicit=True, + ) + payload = _last_post(captured)["json"] + assert payload["model"] == "grok-imagine-video" + def test_reference_images_payload(self, xai_provider): provider, captured = xai_provider provider.generate( diff --git a/tests/tools/test_video_generation_tool_surface_matrix.py b/tests/tools/test_video_generation_tool_surface_matrix.py index edd39b550..dfe1c762b 100644 --- a/tests/tools/test_video_generation_tool_surface_matrix.py +++ b/tests/tools/test_video_generation_tool_surface_matrix.py @@ -82,7 +82,7 @@ def matrix_env(tmp_path, monkeypatch): return _Resp({ "status": "done", "video": {"url": "https://xai-cdn/out.mp4", "duration": 8}, - "model": "grok-imagine-video", + "model": xai_calls[-1]["json"].get("model", "grok-imagine-video"), }) import plugins.video_gen.xai as xai_plugin monkeypatch.setattr(xai_plugin.httpx, "AsyncClient", lambda: _Client()) @@ -202,6 +202,7 @@ def test_xai_text_only_via_tool_surface(matrix_env): assert len(xai_calls) == 1 assert xai_calls[0]["url"].endswith("/videos/generations") payload = xai_calls[0]["json"] or {} + assert payload["model"] == "grok-imagine-video" assert "image" not in payload assert "reference_images" not in payload @@ -221,6 +222,26 @@ def test_xai_text_plus_image_via_tool_surface(matrix_env): assert len(xai_calls) == 1 assert xai_calls[0]["url"].endswith("/videos/generations") payload = xai_calls[0]["json"] or {} + assert payload["model"] == "grok-imagine-video-1.5-preview" + assert payload["image"] == {"url": "https://example.com/img.png"} + + +def test_xai_explicit_model_override_via_tool_surface(matrix_env): + home, _, xai_calls = matrix_env + + result = _invoke_tool( + home, + {"video_gen": {"provider": "xai"}}, + { + "prompt": "animate this", + "image_url": "https://example.com/img.png", + "model": "grok-imagine-video", + }, + ) + assert result["success"] is True + + payload = xai_calls[0]["json"] or {} + assert payload["model"] == "grok-imagine-video" assert payload["image"] == {"url": "https://example.com/img.png"} diff --git a/tools/video_generation_tool.py b/tools/video_generation_tool.py index 472b84092..2465199f3 100644 --- a/tools/video_generation_tool.py +++ b/tools/video_generation_tool.py @@ -336,6 +336,7 @@ def _handle_video_generate(args: Dict[str, Any], **_kw: Any) -> str: kwargs: Dict[str, Any] = { "model": model, + "_model_override_explicit": bool(model_override), "image_url": image_url, "reference_image_urls": reference_image_urls, "duration": duration, diff --git a/website/docs/guides/xai-grok-oauth.md b/website/docs/guides/xai-grok-oauth.md index 2a9841687..bd30e5db1 100644 --- a/website/docs/guides/xai-grok-oauth.md +++ b/website/docs/guides/xai-grok-oauth.md @@ -182,7 +182,8 @@ The `x_search` toolset auto-enables whenever xAI credentials (a SuperGrok / X Pr | Chat | `grok-4.20-multi-agent-0309` | Multi-agent variant | | Image | `grok-imagine-image` | Default; ~5–10 s | | Image | `grok-imagine-image-quality` | Higher fidelity; ~10–20 s | -| Video | `grok-imagine-video` | Text-to-video and image-to-video; up to 7 reference images | +| Video | `grok-imagine-video` | Text-to-video | +| Video | `grok-imagine-video-1.5-preview` | Image-to-video; dated alias `grok-imagine-video-1.5-2026-05-30` | | TTS | (default voice) | xAI `/v1/tts` endpoint | The chat catalog is derived live from the on-disk `models.dev` cache; new xAI releases appear automatically once that cache refreshes. `grok-4.3` is always pinned to the top of the list. diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/guides/xai-grok-oauth.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/guides/xai-grok-oauth.md index 6f6f0cab1..1d3fd02f4 100644 --- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/guides/xai-grok-oauth.md +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/guides/xai-grok-oauth.md @@ -180,7 +180,8 @@ hermes tools | 对话 | `grok-4.20-multi-agent-0309` | 多 agent 变体 | | 图像 | `grok-imagine-image` | 默认;约 5–10 秒 | | 图像 | `grok-imagine-image-quality` | 更高保真度;约 10–20 秒 | -| 视频 | `grok-imagine-video` | 文本转视频和图像转视频;最多 7 张参考图像 | +| 视频 | `grok-imagine-video` | 文本转视频 | +| 视频 | `grok-imagine-video-1.5-preview` | 图像转视频;日期别名 `grok-imagine-video-1.5-2026-05-30` | | TTS | (默认音色) | xAI `/v1/tts` 端点 | 对话模型目录从磁盘上的 `models.dev` 缓存实时获取;缓存刷新后,新的 xAI 模型会自动出现。`grok-4.3` 始终固定在列表顶部。 @@ -266,4 +267,4 @@ hermes auth logout xai-oauth - [AI Providers 参考](../integrations/providers.md) - [环境变量](../reference/environment-variables.md) - [配置](../user-guide/configuration.md) -- [语音与 TTS](../user-guide/features/tts.md) \ No newline at end of file +- [语音与 TTS](../user-guide/features/tts.md)