diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index 6d272ebcc..8bfbc059f 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -258,14 +258,16 @@ TOOL_CATEGORIES = { "requires_nous_auth": True, "managed_nous_feature": "image_gen", "override_env_vars": ["FAL_KEY"], + "imagegen_backend": "fal", }, { "name": "FAL.ai", "badge": "paid", - "tag": "FLUX 2 Pro with auto-upscaling", + "tag": "Pick from flux-2-klein, flux-2-pro, gpt-image, nano-banana, etc.", "env_vars": [ {"key": "FAL_KEY", "prompt": "FAL API key", "url": "https://fal.ai/dashboard/keys"}, ], + "imagegen_backend": "fal", }, ], }, @@ -950,6 +952,106 @@ def _detect_active_provider_index(providers: list, config: dict) -> int: return 0 +# ─── Image Generation Model Pickers ─────────────────────────────────────────── +# +# IMAGEGEN_BACKENDS is a per-backend catalog. Each entry exposes: +# - display: human-readable backend name shown in the picker prompt +# - config_key: top-level config.yaml key for this backend's settings +# - catalog_fn: returns a ({model_id: metadata}, default_model_id) tuple +# +# This prepares for future imagegen backends (Replicate, Stability, etc.): +# each new backend registers its own entry; the FAL provider entry in +# TOOL_CATEGORIES tags itself with `imagegen_backend: "fal"` to select the +# right catalog at picker time. 
+ + +def _fal_model_catalog(): + """Lazy-load the FAL model catalog from the tool module.""" + from tools.image_generation_tool import FAL_MODELS, DEFAULT_MODEL + return FAL_MODELS, DEFAULT_MODEL + + +IMAGEGEN_BACKENDS = { + "fal": { + "display": "FAL.ai", + "config_key": "image_gen", + "catalog_fn": _fal_model_catalog, + }, +} + + +def _format_imagegen_model_row(model_id: str, meta: dict, widths: dict) -> str: + """Format a single picker row with column-aligned speed / strengths / price.""" + return ( + f"{model_id:<{widths['model']}} " + f"{meta.get('speed', ''):<{widths['speed']}} " + f"{meta.get('strengths', ''):<{widths['strengths']}} " + f"{meta.get('price', '')}" + ) + + +def _configure_imagegen_model(backend_name: str, config: dict) -> None: + """Prompt the user to pick a model for the given imagegen backend. + + Writes selection to ``config[backend_config_key]["model"]``. Safe to + call even when stdin is not a TTY — curses_radiolist falls back to + keeping the current selection. + """ + backend = IMAGEGEN_BACKENDS.get(backend_name) + if not backend: + return + + catalog, default_model = backend["catalog_fn"]() + if not catalog: + return + + cfg_key = backend["config_key"] + cur_cfg = config.setdefault(cfg_key, {}) + if not isinstance(cur_cfg, dict): + cur_cfg = {} + config[cfg_key] = cur_cfg + current_model = cur_cfg.get("model") or default_model + if current_model not in catalog: + current_model = default_model + + model_ids = list(catalog.keys()) + # Put current model at the top so the cursor lands on it by default. 
+ ordered = [current_model] + [m for m in model_ids if m != current_model] + + # Column widths + widths = { + "model": max(len(m) for m in model_ids), + "speed": max((len(catalog[m].get("speed", "")) for m in model_ids), default=6), + "strengths": max((len(catalog[m].get("strengths", "")) for m in model_ids), default=0), + } + + print() + header = ( + f" {'Model':<{widths['model']}} " + f"{'Speed':<{widths['speed']}} " + f"{'Strengths':<{widths['strengths']}} " + f"Price" + ) + print(color(header, Colors.CYAN)) + + rows = [] + for mid in ordered: + row = _format_imagegen_model_row(mid, catalog[mid], widths) + if mid == current_model: + row += " ← currently in use" + rows.append(row) + + idx = _prompt_choice( + f" Choose {backend['display']} model:", + rows, + default=0, + ) + + chosen = ordered[idx] + cur_cfg["model"] = chosen + _print_success(f" Model set to: {chosen}") + + def _configure_provider(provider: dict, config: dict): """Configure a single provider - prompt for API keys and set config.""" env_vars = provider.get("env_vars", []) @@ -1006,6 +1108,10 @@ def _configure_provider(provider: dict, config: dict): _print_success(f" {provider['name']} - no configuration needed!") if managed_feature: _print_info(" Requests for this tool will be billed to your Nous subscription.") + # Imagegen backends prompt for model selection after backend pick. + backend = provider.get("imagegen_backend") + if backend: + _configure_imagegen_model(backend, config) return # Prompt for each required env var @@ -1040,6 +1146,10 @@ def _configure_provider(provider: dict, config: dict): if all_configured: _print_success(f" {provider['name']} configured!") + # Imagegen backends prompt for model selection after env vars are in. 
+ backend = provider.get("imagegen_backend") + if backend: + _configure_imagegen_model(backend, config) def _configure_simple_requirements(ts_key: str): @@ -1211,6 +1321,10 @@ def _reconfigure_provider(provider: dict, config: dict): _print_success(f" {provider['name']} - no configuration needed!") if managed_feature: _print_info(" Requests for this tool will be billed to your Nous subscription.") + # Imagegen backends prompt for model selection on reconfig too. + backend = provider.get("imagegen_backend") + if backend: + _configure_imagegen_model(backend, config) return for var in env_vars: @@ -1228,6 +1342,11 @@ def _reconfigure_provider(provider: dict, config: dict): else: _print_info(" Kept current") + # Imagegen backends prompt for model selection on reconfig too. + backend = provider.get("imagegen_backend") + if backend: + _configure_imagegen_model(backend, config) + def _reconfigure_simple_requirements(ts_key: str): """Reconfigure simple env var requirements.""" diff --git a/tests/hermes_cli/test_tools_config.py b/tests/hermes_cli/test_tools_config.py index 09765c440..3a72490b4 100644 --- a/tests/hermes_cli/test_tools_config.py +++ b/tests/hermes_cli/test_tools_config.py @@ -466,3 +466,90 @@ def test_numeric_mcp_server_name_does_not_crash_sorted(): # sorted() must not raise TypeError sorted(enabled) + + +# ─── Imagegen Backend Picker Wiring ──────────────────────────────────────── + +class TestImagegenBackendRegistry: + """IMAGEGEN_BACKENDS tags drive the model picker flow in tools_config.""" + + def test_fal_backend_registered(self): + from hermes_cli.tools_config import IMAGEGEN_BACKENDS + assert "fal" in IMAGEGEN_BACKENDS + + def test_fal_catalog_loads_lazily(self): + """catalog_fn should defer import to avoid import cycles.""" + from hermes_cli.tools_config import IMAGEGEN_BACKENDS + catalog, default = IMAGEGEN_BACKENDS["fal"]["catalog_fn"]() + assert default == "fal-ai/flux-2/klein/9b" + assert "fal-ai/flux-2/klein/9b" in catalog + assert 
"fal-ai/flux-2-pro" in catalog + + def test_image_gen_providers_tagged_with_fal_backend(self): + """Both Nous Subscription and FAL.ai providers must carry the + imagegen_backend tag so _configure_provider fires the picker.""" + from hermes_cli.tools_config import TOOL_CATEGORIES + providers = TOOL_CATEGORIES["image_gen"]["providers"] + for p in providers: + assert p.get("imagegen_backend") == "fal", ( + f"{p['name']} missing imagegen_backend tag" + ) + + +class TestImagegenModelPicker: + """_configure_imagegen_model writes selection to config and respects + curses fallback semantics (returns default when stdin isn't a TTY).""" + + def test_picker_writes_chosen_model_to_config(self): + from hermes_cli.tools_config import _configure_imagegen_model + config = {} + # Force _prompt_choice to pick index 1 (second-in-ordered-list). + with patch("hermes_cli.tools_config._prompt_choice", return_value=1): + _configure_imagegen_model("fal", config) + # ordered[0] == current (default klein), ordered[1] == first non-default + assert config["image_gen"]["model"] != "fal-ai/flux-2/klein/9b" + assert config["image_gen"]["model"].startswith("fal-ai/") + + def test_picker_with_gpt_image_does_not_prompt_quality(self): + """GPT-Image quality is pinned to medium in the tool's defaults — + no follow-up prompt, no config write for quality_setting.""" + from hermes_cli.tools_config import ( + _configure_imagegen_model, + IMAGEGEN_BACKENDS, + ) + catalog, default_model = IMAGEGEN_BACKENDS["fal"]["catalog_fn"]() + model_ids = list(catalog.keys()) + ordered = [default_model] + [m for m in model_ids if m != default_model] + gpt_idx = ordered.index("fal-ai/gpt-image-1.5") + + # Only ONE picker call is expected (for model) — not two (model + quality). 
+ call_count = {"n": 0} + def fake_prompt(*a, **kw): + call_count["n"] += 1 + return gpt_idx + + config = {} + with patch("hermes_cli.tools_config._prompt_choice", side_effect=fake_prompt): + _configure_imagegen_model("fal", config) + + assert call_count["n"] == 1, ( + f"Expected 1 picker call (model only), got {call_count['n']}" + ) + assert config["image_gen"]["model"] == "fal-ai/gpt-image-1.5" + assert "quality_setting" not in config["image_gen"] + + def test_picker_no_op_for_unknown_backend(self): + from hermes_cli.tools_config import _configure_imagegen_model + config = {} + _configure_imagegen_model("nonexistent-backend", config) + assert config == {} # untouched + + def test_picker_repairs_corrupt_config_section(self): + """When image_gen is a non-dict (user-edit YAML), the picker should + replace it with a fresh dict rather than crash.""" + from hermes_cli.tools_config import _configure_imagegen_model + config = {"image_gen": "some-garbage-string"} + with patch("hermes_cli.tools_config._prompt_choice", return_value=0): + _configure_imagegen_model("fal", config) + assert isinstance(config["image_gen"], dict) + assert config["image_gen"]["model"] == "fal-ai/flux-2/klein/9b" diff --git a/tests/tools/test_image_generation.py b/tests/tools/test_image_generation.py new file mode 100644 index 000000000..cf4e08706 --- /dev/null +++ b/tests/tools/test_image_generation.py @@ -0,0 +1,450 @@ +"""Tests for tools/image_generation_tool.py — FAL multi-model support. + +Covers the pure logic of the new wrapper: catalog integrity, the three size +families (image_size_preset / aspect_ratio / gpt_literal), the supports +whitelist, default merging, GPT quality pinning, and model resolution +fallback. Does NOT exercise fal_client submission — that's covered by +tests/tools/test_managed_media_gateways.py. 
+""" + +from __future__ import annotations + +from unittest.mock import patch + +import pytest + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture +def image_tool(): + """Fresh import of tools.image_generation_tool per test.""" + import importlib + import tools.image_generation_tool as mod + return importlib.reload(mod) + + +# --------------------------------------------------------------------------- +# Catalog integrity +# --------------------------------------------------------------------------- + +class TestFalCatalog: + """Every FAL_MODELS entry must have a consistent shape.""" + + def test_default_model_is_klein(self, image_tool): + assert image_tool.DEFAULT_MODEL == "fal-ai/flux-2/klein/9b" + + def test_default_model_in_catalog(self, image_tool): + assert image_tool.DEFAULT_MODEL in image_tool.FAL_MODELS + + def test_all_entries_have_required_keys(self, image_tool): + required = { + "display", "speed", "strengths", "price", + "size_style", "sizes", "defaults", "supports", "upscale", + } + for mid, meta in image_tool.FAL_MODELS.items(): + missing = required - set(meta.keys()) + assert not missing, f"{mid} missing required keys: {missing}" + + def test_size_style_is_valid(self, image_tool): + valid = {"image_size_preset", "aspect_ratio", "gpt_literal"} + for mid, meta in image_tool.FAL_MODELS.items(): + assert meta["size_style"] in valid, \ + f"{mid} has invalid size_style: {meta['size_style']}" + + def test_sizes_cover_all_aspect_ratios(self, image_tool): + for mid, meta in image_tool.FAL_MODELS.items(): + assert set(meta["sizes"].keys()) >= {"landscape", "square", "portrait"}, \ + f"{mid} missing a required aspect_ratio key" + + def test_supports_is_a_set(self, image_tool): + for mid, meta in image_tool.FAL_MODELS.items(): + assert isinstance(meta["supports"], set), \ + f"{mid}.supports must be a set, got 
{type(meta['supports'])}" + + def test_prompt_is_always_supported(self, image_tool): + for mid, meta in image_tool.FAL_MODELS.items(): + assert "prompt" in meta["supports"], \ + f"{mid} must support 'prompt'" + + def test_only_flux2_pro_upscales_by_default(self, image_tool): + """Upscaling should default to False for all new models to preserve + the <1s / fast-render value prop. Only flux-2-pro stays True for + backward-compat with the previous default.""" + for mid, meta in image_tool.FAL_MODELS.items(): + if mid == "fal-ai/flux-2-pro": + assert meta["upscale"] is True, \ + "flux-2-pro should keep upscale=True for backward-compat" + else: + assert meta["upscale"] is False, \ + f"{mid} should default to upscale=False" + + +# --------------------------------------------------------------------------- +# Payload building — three size families +# --------------------------------------------------------------------------- + +class TestImageSizePresetFamily: + """Flux, z-image, qwen, recraft, ideogram all use preset enum sizes.""" + + def test_klein_landscape_uses_preset(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/flux-2/klein/9b", "hello", "landscape") + assert p["image_size"] == "landscape_16_9" + assert "aspect_ratio" not in p + + def test_klein_square_uses_preset(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/flux-2/klein/9b", "hello", "square") + assert p["image_size"] == "square_hd" + + def test_klein_portrait_uses_preset(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/flux-2/klein/9b", "hello", "portrait") + assert p["image_size"] == "portrait_16_9" + + +class TestAspectRatioFamily: + """Nano-banana uses aspect_ratio enum, NOT image_size.""" + + def test_nano_banana_landscape_uses_aspect_ratio(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/nano-banana", "hello", "landscape") + assert p["aspect_ratio"] == "16:9" + assert "image_size" not in p + + def test_nano_banana_square_uses_aspect_ratio(self, 
image_tool): + p = image_tool._build_fal_payload("fal-ai/nano-banana", "hello", "square") + assert p["aspect_ratio"] == "1:1" + + def test_nano_banana_portrait_uses_aspect_ratio(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/nano-banana", "hello", "portrait") + assert p["aspect_ratio"] == "9:16" + + +class TestGptLiteralFamily: + """GPT-Image 1.5 uses literal size strings.""" + + def test_gpt_landscape_is_literal(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/gpt-image-1.5", "hello", "landscape") + assert p["image_size"] == "1536x1024" + + def test_gpt_square_is_literal(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/gpt-image-1.5", "hello", "square") + assert p["image_size"] == "1024x1024" + + def test_gpt_portrait_is_literal(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/gpt-image-1.5", "hello", "portrait") + assert p["image_size"] == "1024x1536" + + +# --------------------------------------------------------------------------- +# Supports whitelist — the main safety property +# --------------------------------------------------------------------------- + +class TestSupportsFilter: + """No model should receive keys outside its `supports` set.""" + + def test_payload_keys_are_subset_of_supports_for_all_models(self, image_tool): + for mid, meta in image_tool.FAL_MODELS.items(): + payload = image_tool._build_fal_payload(mid, "test", "landscape", seed=42) + unsupported = set(payload.keys()) - meta["supports"] + assert not unsupported, \ + f"{mid} payload has unsupported keys: {unsupported}" + + def test_gpt_image_has_no_seed_even_if_passed(self, image_tool): + # GPT-Image 1.5 does not support seed — the filter must strip it. 
+ p = image_tool._build_fal_payload("fal-ai/gpt-image-1.5", "hi", "square", seed=42) + assert "seed" not in p + + def test_gpt_image_strips_unsupported_overrides(self, image_tool): + p = image_tool._build_fal_payload( + "fal-ai/gpt-image-1.5", "hi", "square", + overrides={"guidance_scale": 7.5, "num_inference_steps": 50}, + ) + assert "guidance_scale" not in p + assert "num_inference_steps" not in p + + def test_recraft_has_minimal_payload(self, image_tool): + # Recraft supports prompt, image_size, style only. + p = image_tool._build_fal_payload("fal-ai/recraft-v3", "hi", "landscape") + assert set(p.keys()) <= {"prompt", "image_size", "style"} + + def test_nano_banana_never_gets_image_size(self, image_tool): + # Common bug: translator accidentally setting both image_size and aspect_ratio. + p = image_tool._build_fal_payload("fal-ai/nano-banana", "hi", "landscape", seed=1) + assert "image_size" not in p + assert p["aspect_ratio"] == "16:9" + + +# --------------------------------------------------------------------------- +# Default merging +# --------------------------------------------------------------------------- + +class TestDefaults: + """Model-level defaults should carry through unless overridden.""" + + def test_klein_default_steps_is_4(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/flux-2/klein/9b", "hi", "square") + assert p["num_inference_steps"] == 4 + + def test_flux_2_pro_default_steps_is_50(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/flux-2-pro", "hi", "square") + assert p["num_inference_steps"] == 50 + + def test_override_replaces_default(self, image_tool): + p = image_tool._build_fal_payload( + "fal-ai/flux-2-pro", "hi", "square", overrides={"num_inference_steps": 25} + ) + assert p["num_inference_steps"] == 25 + + def test_none_override_does_not_replace_default(self, image_tool): + """None values from caller should be ignored (use default).""" + p = image_tool._build_fal_payload( + "fal-ai/flux-2-pro", "hi", 
"square", + overrides={"num_inference_steps": None}, + ) + assert p["num_inference_steps"] == 50 + + +# --------------------------------------------------------------------------- +# GPT-Image quality is pinned to medium (not user-configurable) +# --------------------------------------------------------------------------- + +class TestGptQualityPinnedToMedium: + """GPT-Image quality is baked into the FAL_MODELS defaults at 'medium' + and cannot be overridden via config. Pinning keeps Nous Portal billing + predictable across all users.""" + + def test_gpt_payload_always_has_medium_quality(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/gpt-image-1.5", "hi", "square") + assert p["quality"] == "medium" + + def test_config_quality_setting_is_ignored(self, image_tool): + """Even if a user manually edits config.yaml and adds quality_setting, + the payload must still use medium. No code path reads that field.""" + with patch("hermes_cli.config.load_config", + return_value={"image_gen": {"quality_setting": "high"}}): + p = image_tool._build_fal_payload("fal-ai/gpt-image-1.5", "hi", "square") + assert p["quality"] == "medium" + + def test_non_gpt_model_never_gets_quality(self, image_tool): + """quality is only meaningful for gpt-image-1.5 — other models should + never have it in their payload.""" + for mid in image_tool.FAL_MODELS: + if mid == "fal-ai/gpt-image-1.5": + continue + p = image_tool._build_fal_payload(mid, "hi", "square") + assert "quality" not in p, f"{mid} unexpectedly has 'quality' in payload" + + def test_honors_quality_setting_flag_is_removed(self, image_tool): + """The honors_quality_setting flag was the old override trigger. 
+ It must not be present on any model entry anymore.""" + for mid, meta in image_tool.FAL_MODELS.items(): + assert "honors_quality_setting" not in meta, ( + f"{mid} still has honors_quality_setting; " + f"remove it — quality is pinned to medium" + ) + + def test_resolve_gpt_quality_function_is_gone(self, image_tool): + """The _resolve_gpt_quality() helper was removed — quality is now + a static default, not a runtime lookup.""" + assert not hasattr(image_tool, "_resolve_gpt_quality"), ( + "_resolve_gpt_quality should not exist — quality is pinned" + ) + + +# --------------------------------------------------------------------------- +# Model resolution +# --------------------------------------------------------------------------- + +class TestModelResolution: + + def test_no_config_falls_back_to_default(self, image_tool): + with patch("hermes_cli.config.load_config", return_value={}): + mid, meta = image_tool._resolve_fal_model() + assert mid == "fal-ai/flux-2/klein/9b" + + def test_valid_config_model_is_used(self, image_tool): + with patch("hermes_cli.config.load_config", + return_value={"image_gen": {"model": "fal-ai/flux-2-pro"}}): + mid, meta = image_tool._resolve_fal_model() + assert mid == "fal-ai/flux-2-pro" + assert meta["upscale"] is True # flux-2-pro keeps backward-compat upscaling + + def test_unknown_model_falls_back_to_default_with_warning(self, image_tool, caplog): + with patch("hermes_cli.config.load_config", + return_value={"image_gen": {"model": "fal-ai/nonexistent-9000"}}): + mid, _ = image_tool._resolve_fal_model() + assert mid == "fal-ai/flux-2/klein/9b" + + def test_env_var_fallback_when_no_config(self, image_tool, monkeypatch): + monkeypatch.setenv("FAL_IMAGE_MODEL", "fal-ai/z-image/turbo") + with patch("hermes_cli.config.load_config", return_value={}): + mid, _ = image_tool._resolve_fal_model() + assert mid == "fal-ai/z-image/turbo" + + def test_config_wins_over_env_var(self, image_tool, monkeypatch): + monkeypatch.setenv("FAL_IMAGE_MODEL", 
"fal-ai/z-image/turbo") + with patch("hermes_cli.config.load_config", + return_value={"image_gen": {"model": "fal-ai/nano-banana"}}): + mid, _ = image_tool._resolve_fal_model() + assert mid == "fal-ai/nano-banana" + + +# --------------------------------------------------------------------------- +# Aspect ratio handling +# --------------------------------------------------------------------------- + +class TestAspectRatioNormalization: + + def test_invalid_aspect_defaults_to_landscape(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/flux-2/klein/9b", "hi", "cinemascope") + assert p["image_size"] == "landscape_16_9" + + def test_uppercase_aspect_is_normalized(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/flux-2/klein/9b", "hi", "PORTRAIT") + assert p["image_size"] == "portrait_16_9" + + def test_empty_aspect_defaults_to_landscape(self, image_tool): + p = image_tool._build_fal_payload("fal-ai/flux-2/klein/9b", "hi", "") + assert p["image_size"] == "landscape_16_9" + + +# --------------------------------------------------------------------------- +# Schema + registry integrity +# --------------------------------------------------------------------------- + +class TestRegistryIntegration: + + def test_schema_exposes_only_prompt_and_aspect_ratio_to_agent(self, image_tool): + """The agent-facing schema must stay tight — model selection is a + user-level config choice, not an agent-level arg.""" + props = image_tool.IMAGE_GENERATE_SCHEMA["parameters"]["properties"] + assert set(props.keys()) == {"prompt", "aspect_ratio"} + + def test_aspect_ratio_enum_is_three_values(self, image_tool): + enum = image_tool.IMAGE_GENERATE_SCHEMA["parameters"]["properties"]["aspect_ratio"]["enum"] + assert set(enum) == {"landscape", "square", "portrait"} + + +# --------------------------------------------------------------------------- +# Managed gateway 4xx translation +# --------------------------------------------------------------------------- + +class 
_MockResponse: + def __init__(self, status_code: int): + self.status_code = status_code + + +class _MockHttpxError(Exception): + """Simulates httpx.HTTPStatusError which exposes .response.status_code.""" + def __init__(self, status_code: int, message: str = "Bad Request"): + super().__init__(message) + self.response = _MockResponse(status_code) + + +class TestExtractHttpStatus: + """Status-code extraction should work across exception shapes.""" + + def test_extracts_from_response_attr(self, image_tool): + exc = _MockHttpxError(403) + assert image_tool._extract_http_status(exc) == 403 + + def test_extracts_from_status_code_attr(self, image_tool): + exc = Exception("fail") + exc.status_code = 404 # type: ignore[attr-defined] + assert image_tool._extract_http_status(exc) == 404 + + def test_returns_none_for_non_http_exception(self, image_tool): + assert image_tool._extract_http_status(ValueError("nope")) is None + assert image_tool._extract_http_status(RuntimeError("nope")) is None + + def test_response_attr_without_status_code_returns_none(self, image_tool): + class OddResponse: + pass + exc = Exception("weird") + exc.response = OddResponse() # type: ignore[attr-defined] + assert image_tool._extract_http_status(exc) is None + + +class TestManagedGatewayErrorTranslation: + """4xx from the Nous managed gateway should be translated to a user-actionable message.""" + + def test_4xx_translates_to_value_error_with_remediation(self, image_tool, monkeypatch): + """403 from managed gateway → ValueError mentioning FAL_KEY + hermes tools.""" + from unittest.mock import MagicMock + + # Simulate: managed mode active, managed submit raises 4xx. 
+ managed_gateway = MagicMock() + managed_gateway.gateway_origin = "https://fal-queue-gateway.example.com" + managed_gateway.nous_user_token = "test-token" + monkeypatch.setattr(image_tool, "_resolve_managed_fal_gateway", + lambda: managed_gateway) + + bad_request = _MockHttpxError(403, "Forbidden") + mock_managed_client = MagicMock() + mock_managed_client.submit.side_effect = bad_request + monkeypatch.setattr(image_tool, "_get_managed_fal_client", + lambda gw: mock_managed_client) + + with pytest.raises(ValueError) as exc_info: + image_tool._submit_fal_request("fal-ai/nano-banana", {"prompt": "x"}) + + msg = str(exc_info.value) + assert "fal-ai/nano-banana" in msg + assert "403" in msg + assert "FAL_KEY" in msg + assert "hermes tools" in msg + # Original exception chained for debugging + assert exc_info.value.__cause__ is bad_request + + def test_5xx_is_not_translated(self, image_tool, monkeypatch): + """500s are real outages, not model-availability issues — don't rewrite them.""" + from unittest.mock import MagicMock + + managed_gateway = MagicMock() + monkeypatch.setattr(image_tool, "_resolve_managed_fal_gateway", + lambda: managed_gateway) + + server_error = _MockHttpxError(502, "Bad Gateway") + mock_managed_client = MagicMock() + mock_managed_client.submit.side_effect = server_error + monkeypatch.setattr(image_tool, "_get_managed_fal_client", + lambda gw: mock_managed_client) + + with pytest.raises(_MockHttpxError): + image_tool._submit_fal_request("fal-ai/flux-2-pro", {"prompt": "x"}) + + def test_direct_fal_errors_are_not_translated(self, image_tool, monkeypatch): + """When user has direct FAL_KEY (managed gateway returns None), raw + errors from fal_client bubble up unchanged — fal_client already + provides reasonable error messages for direct usage.""" + from unittest.mock import MagicMock + + monkeypatch.setattr(image_tool, "_resolve_managed_fal_gateway", + lambda: None) + + direct_error = _MockHttpxError(403, "Forbidden") + fake_fal_client = MagicMock() 
+ fake_fal_client.submit.side_effect = direct_error + monkeypatch.setattr(image_tool, "fal_client", fake_fal_client) + + with pytest.raises(_MockHttpxError): + image_tool._submit_fal_request("fal-ai/flux-2-pro", {"prompt": "x"}) + + def test_non_http_exception_from_managed_bubbles_up(self, image_tool, monkeypatch): + """Connection errors, timeouts, etc. from managed mode aren't 4xx — + they should bubble up unchanged so callers can retry or diagnose.""" + from unittest.mock import MagicMock + + managed_gateway = MagicMock() + monkeypatch.setattr(image_tool, "_resolve_managed_fal_gateway", + lambda: managed_gateway) + + conn_error = ConnectionError("network down") + mock_managed_client = MagicMock() + mock_managed_client.submit.side_effect = conn_error + monkeypatch.setattr(image_tool, "_get_managed_fal_client", + lambda gw: mock_managed_client) + + with pytest.raises(ConnectionError): + image_tool._submit_fal_request("fal-ai/flux-2-pro", {"prompt": "x"}) diff --git a/tools/image_generation_tool.py b/tools/image_generation_tool.py index db2c5254e..8871b8df5 100644 --- a/tools/image_generation_tool.py +++ b/tools/image_generation_tool.py @@ -2,30 +2,22 @@ """ Image Generation Tools Module -This module provides image generation tools using FAL.ai's FLUX 2 Pro model with -automatic upscaling via FAL.ai's Clarity Upscaler for enhanced image quality. +Provides image generation via FAL.ai. Multiple FAL models are supported and +selectable via ``hermes tools`` → Image Generation; the active model is +persisted to ``image_gen.model`` in ``config.yaml``. -Available tools: -- image_generate_tool: Generate images from text prompts with automatic upscaling +Architecture: +- ``FAL_MODELS`` is a catalog of supported models with per-model metadata + (size-style family, defaults, ``supports`` whitelist, upscaler flag). 
+- ``_build_fal_payload()`` translates the agent's unified inputs (prompt + + aspect_ratio) into the model-specific payload and filters to the + ``supports`` whitelist so models never receive rejected keys. +- Upscaling via FAL's Clarity Upscaler is gated per-model via the ``upscale`` + flag — on for FLUX 2 Pro (backward-compat), off for all faster/newer models + where upscaling would either hurt latency or add marginal quality. -Features: -- High-quality image generation using FLUX 2 Pro model -- Automatic 2x upscaling using Clarity Upscaler for enhanced quality -- Comprehensive parameter control (size, steps, guidance, etc.) -- Proper error handling and validation with fallback to original images -- Debug logging support -- Sync mode for immediate results - -Usage: - from image_generation_tool import image_generate_tool - import asyncio - - # Generate and automatically upscale an image - result = await image_generate_tool( - prompt="A serene mountain landscape with cherry blossoms", - image_size="landscape_4_3", - num_images=1 - ) +Pricing shown in UI strings is as-of the initial commit; we accept drift and +update when it's noticed. 
""" import json @@ -34,35 +26,237 @@ import os import datetime import threading import uuid -from typing import Dict, Any, Optional, Union +from typing import Any, Dict, Optional, Union from urllib.parse import urlencode + import fal_client + from tools.debug_helpers import DebugSession from tools.managed_tool_gateway import resolve_managed_tool_gateway from tools.tool_backend_helpers import managed_nous_tools_enabled, prefers_gateway logger = logging.getLogger(__name__) -# Configuration for image generation -DEFAULT_MODEL = "fal-ai/flux-2-pro" -DEFAULT_ASPECT_RATIO = "landscape" -DEFAULT_NUM_INFERENCE_STEPS = 50 -DEFAULT_GUIDANCE_SCALE = 4.5 -DEFAULT_NUM_IMAGES = 1 -DEFAULT_OUTPUT_FORMAT = "png" -# Safety settings -ENABLE_SAFETY_CHECKER = False -SAFETY_TOLERANCE = "5" # Maximum tolerance (1-5, where 5 is most permissive) +# --------------------------------------------------------------------------- +# FAL model catalog +# --------------------------------------------------------------------------- +# +# Each entry declares how to translate our unified inputs into the model's +# native payload shape. Size specification falls into three families: +# +# "image_size_preset" — preset enum ("square_hd", "landscape_16_9", ...) +# used by the flux family, z-image, qwen, recraft, +# ideogram. +# "aspect_ratio" — aspect ratio enum ("16:9", "1:1", ...) used by +# nano-banana (Gemini). +# "gpt_literal" — literal dimension strings ("1024x1024", etc.) +# used by gpt-image-1.5. +# +# ``supports`` is a whitelist of keys allowed in the outgoing payload — any +# key outside this set is stripped before submission so models never receive +# rejected parameters (each FAL model rejects unknown keys differently). +# +# ``upscale`` controls whether to chain Clarity Upscaler after generation. 
-# Aspect ratio mapping - simplified choices for model to select -ASPECT_RATIO_MAP = { - "landscape": "landscape_16_9", - "square": "square_hd", - "portrait": "portrait_16_9" +FAL_MODELS: Dict[str, Dict[str, Any]] = { + "fal-ai/flux-2/klein/9b": { + "display": "FLUX 2 Klein 9B", + "speed": "<1s", + "strengths": "Fast, crisp text", + "price": "$0.006/MP", + "size_style": "image_size_preset", + "sizes": { + "landscape": "landscape_16_9", + "square": "square_hd", + "portrait": "portrait_16_9", + }, + "defaults": { + "num_inference_steps": 4, + "output_format": "png", + "enable_safety_checker": False, + }, + "supports": { + "prompt", "image_size", "num_inference_steps", "seed", + "output_format", "enable_safety_checker", + }, + "upscale": False, + }, + "fal-ai/flux-2-pro": { + "display": "FLUX 2 Pro", + "speed": "~6s", + "strengths": "Studio photorealism", + "price": "$0.03/MP", + "size_style": "image_size_preset", + "sizes": { + "landscape": "landscape_16_9", + "square": "square_hd", + "portrait": "portrait_16_9", + }, + "defaults": { + "num_inference_steps": 50, + "guidance_scale": 4.5, + "num_images": 1, + "output_format": "png", + "enable_safety_checker": False, + "safety_tolerance": "5", + "sync_mode": True, + }, + "supports": { + "prompt", "image_size", "num_inference_steps", "guidance_scale", + "num_images", "output_format", "enable_safety_checker", + "safety_tolerance", "sync_mode", "seed", + }, + "upscale": True, # Backward-compat: current default behavior. 
+ }, + "fal-ai/z-image/turbo": { + "display": "Z-Image Turbo", + "speed": "~2s", + "strengths": "Bilingual EN/CN, 6B", + "price": "$0.005/MP", + "size_style": "image_size_preset", + "sizes": { + "landscape": "landscape_16_9", + "square": "square_hd", + "portrait": "portrait_16_9", + }, + "defaults": { + "num_inference_steps": 8, + "num_images": 1, + "output_format": "png", + "enable_safety_checker": False, + "enable_prompt_expansion": False, # avoid the extra per-request charge + }, + "supports": { + "prompt", "image_size", "num_inference_steps", "num_images", + "seed", "output_format", "enable_safety_checker", + "enable_prompt_expansion", + }, + "upscale": False, + }, + "fal-ai/nano-banana": { + "display": "Nano Banana (Gemini 2.5 Flash Image)", + "speed": "~6s", + "strengths": "Gemini 2.5, consistency", + "price": "$0.08/image", + "size_style": "aspect_ratio", + "sizes": { + "landscape": "16:9", + "square": "1:1", + "portrait": "9:16", + }, + "defaults": { + "num_images": 1, + "output_format": "png", + "safety_tolerance": "5", + }, + "supports": { + "prompt", "aspect_ratio", "num_images", "output_format", + "safety_tolerance", "seed", "sync_mode", + }, + "upscale": False, + }, + "fal-ai/gpt-image-1.5": { + "display": "GPT Image 1.5", + "speed": "~15s", + "strengths": "Prompt adherence", + "price": "$0.034/image", + "size_style": "gpt_literal", + "sizes": { + "landscape": "1536x1024", + "square": "1024x1024", + "portrait": "1024x1536", + }, + "defaults": { + # Quality is pinned to medium to keep portal billing predictable + # across all users (low is too rough, high is 4-6x more expensive). 
+ "quality": "medium", + "num_images": 1, + "output_format": "png", + }, + "supports": { + "prompt", "image_size", "quality", "num_images", "output_format", + "background", "sync_mode", + }, + "upscale": False, + }, + "fal-ai/ideogram/v3": { + "display": "Ideogram V3", + "speed": "~5s", + "strengths": "Best typography", + "price": "$0.03-0.09/image", + "size_style": "image_size_preset", + "sizes": { + "landscape": "landscape_16_9", + "square": "square_hd", + "portrait": "portrait_16_9", + }, + "defaults": { + "rendering_speed": "BALANCED", + "expand_prompt": True, + "style": "AUTO", + }, + "supports": { + "prompt", "image_size", "rendering_speed", "expand_prompt", + "style", "seed", + }, + "upscale": False, + }, + "fal-ai/recraft-v3": { + "display": "Recraft V3", + "speed": "~8s", + "strengths": "Vector, brand styles", + "price": "$0.04/image", + "size_style": "image_size_preset", + "sizes": { + "landscape": "landscape_16_9", + "square": "square_hd", + "portrait": "portrait_16_9", + }, + "defaults": { + "style": "realistic_image", + }, + "supports": { + "prompt", "image_size", "style", + }, + "upscale": False, + }, + "fal-ai/qwen-image": { + "display": "Qwen Image", + "speed": "~12s", + "strengths": "LLM-based, complex text", + "price": "$0.02/MP", + "size_style": "image_size_preset", + "sizes": { + "landscape": "landscape_16_9", + "square": "square_hd", + "portrait": "portrait_16_9", + }, + "defaults": { + "num_inference_steps": 30, + "guidance_scale": 2.5, + "num_images": 1, + "output_format": "png", + "acceleration": "regular", + }, + "supports": { + "prompt", "image_size", "num_inference_steps", "guidance_scale", + "num_images", "output_format", "acceleration", "seed", "sync_mode", + }, + "upscale": False, + }, } -# Configuration for automatic upscaling +# Default model is the fastest reasonable option. Kept cheap and sub-1s. 
+DEFAULT_MODEL = "fal-ai/flux-2/klein/9b" + +DEFAULT_ASPECT_RATIO = "landscape" +VALID_ASPECT_RATIOS = ("landscape", "square", "portrait") + + +# --------------------------------------------------------------------------- +# Upscaler (Clarity Upscaler — unchanged from previous implementation) +# --------------------------------------------------------------------------- UPSCALER_MODEL = "fal-ai/clarity-upscaler" UPSCALER_FACTOR = 2 UPSCALER_SAFETY_CHECKER = False @@ -73,12 +267,6 @@ UPSCALER_RESEMBLANCE = 0.6 UPSCALER_GUIDANCE_SCALE = 4 UPSCALER_NUM_INFERENCE_STEPS = 18 -# Valid parameter values for validation based on FLUX 2 Pro documentation -VALID_IMAGE_SIZES = [ - "square_hd", "square", "portrait_4_3", "portrait_16_9", "landscape_4_3", "landscape_16_9" -] -VALID_OUTPUT_FORMATS = ["jpeg", "png"] -VALID_ACCELERATION_MODES = ["none", "regular", "high"] _debug = DebugSession("image_tools", env_var="IMAGE_TOOLS_DEBUG") _managed_fal_client = None @@ -86,6 +274,9 @@ _managed_fal_client_config = None _managed_fal_client_lock = threading.Lock() +# --------------------------------------------------------------------------- +# Managed FAL gateway (Nous Subscription) +# --------------------------------------------------------------------------- def _resolve_managed_fal_gateway(): """Return managed fal-queue gateway config when the user prefers the gateway or direct FAL credentials are absent.""" @@ -208,104 +399,140 @@ def _submit_fal_request(model: str, arguments: Dict[str, Any]): return fal_client.submit(model, arguments=arguments, headers=request_headers) managed_client = _get_managed_fal_client(managed_gateway) - return managed_client.submit( - model, - arguments=arguments, - headers=request_headers, - ) + try: + return managed_client.submit( + model, + arguments=arguments, + headers=request_headers, + ) + except Exception as exc: + # 4xx from the managed gateway typically means the portal doesn't + # currently proxy this model (allowlist miss, billing gate, etc.) 
+ # — surface a clearer message with actionable remediation instead + # of a raw HTTP error from httpx. + status = _extract_http_status(exc) + if status is not None and 400 <= status < 500: + raise ValueError( + f"Nous Subscription gateway rejected model '{model}' " + f"(HTTP {status}). This model may not yet be enabled on " + f"the Nous Portal's FAL proxy. Either:\n" + f" • Set FAL_KEY in your environment to use FAL.ai directly, or\n" + f" • Pick a different model via `hermes tools` → Image Generation." + ) from exc + raise -def _validate_parameters( - image_size: Union[str, Dict[str, int]], - num_inference_steps: int, - guidance_scale: float, - num_images: int, - output_format: str, - acceleration: str = "none" +def _extract_http_status(exc: BaseException) -> Optional[int]: + """Return an HTTP status code from httpx/fal exceptions, else None. + + Defensive across exception shapes — httpx.HTTPStatusError exposes + ``.response.status_code`` while fal_client wrappers may expose + ``.status_code`` directly. + """ + response = getattr(exc, "response", None) + if response is not None: + status = getattr(response, "status_code", None) + if isinstance(status, int): + return status + status = getattr(exc, "status_code", None) + if isinstance(status, int): + return status + return None + + +# --------------------------------------------------------------------------- +# Model resolution + payload construction +# --------------------------------------------------------------------------- +def _resolve_fal_model() -> tuple: + """Resolve the active FAL model from config.yaml (primary) or default. + + Returns (model_id, metadata_dict). Falls back to DEFAULT_MODEL if the + configured model is unknown (logged as a warning). 
+ """ + model_id = "" + try: + from hermes_cli.config import load_config + cfg = load_config() + img_cfg = cfg.get("image_gen") if isinstance(cfg, dict) else None + if isinstance(img_cfg, dict): + raw = img_cfg.get("model") + if isinstance(raw, str): + model_id = raw.strip() + except Exception as exc: + logger.debug("Could not load image_gen.model from config: %s", exc) + + # Env var escape hatch (undocumented; backward-compat for tests/scripts). + if not model_id: + model_id = os.getenv("FAL_IMAGE_MODEL", "").strip() + + if not model_id: + return DEFAULT_MODEL, FAL_MODELS[DEFAULT_MODEL] + + if model_id not in FAL_MODELS: + logger.warning( + "Unknown FAL model '%s' in config; falling back to %s", + model_id, DEFAULT_MODEL, + ) + return DEFAULT_MODEL, FAL_MODELS[DEFAULT_MODEL] + + return model_id, FAL_MODELS[model_id] + + +def _build_fal_payload( + model_id: str, + prompt: str, + aspect_ratio: str = DEFAULT_ASPECT_RATIO, + seed: Optional[int] = None, + overrides: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: + """Build a FAL request payload for `model_id` from unified inputs. + + Translates aspect_ratio into the model's native size spec (preset enum, + aspect-ratio enum, or GPT literal string), merges model defaults, applies + caller overrides, then filters to the model's ``supports`` whitelist. """ - Validate and normalize image generation parameters for FLUX 2 Pro model. - - Args: - image_size: Either a preset string or custom size dict - num_inference_steps: Number of inference steps - guidance_scale: Guidance scale value - num_images: Number of images to generate - output_format: Output format for images - acceleration: Acceleration mode for generation speed - - Returns: - Dict[str, Any]: Validated and normalized parameters - - Raises: - ValueError: If any parameter is invalid - """ - validated = {} - - # Validate image_size - if isinstance(image_size, str): - if image_size not in VALID_IMAGE_SIZES: - raise ValueError(f"Invalid image_size '{image_size}'. 
Must be one of: {VALID_IMAGE_SIZES}") - validated["image_size"] = image_size - elif isinstance(image_size, dict): - if "width" not in image_size or "height" not in image_size: - raise ValueError("Custom image_size must contain 'width' and 'height' keys") - if not isinstance(image_size["width"], int) or not isinstance(image_size["height"], int): - raise ValueError("Custom image_size width and height must be integers") - if image_size["width"] < 64 or image_size["height"] < 64: - raise ValueError("Custom image_size dimensions must be at least 64x64") - if image_size["width"] > 2048 or image_size["height"] > 2048: - raise ValueError("Custom image_size dimensions must not exceed 2048x2048") - validated["image_size"] = image_size + meta = FAL_MODELS[model_id] + size_style = meta["size_style"] + sizes = meta["sizes"] + + aspect = (aspect_ratio or DEFAULT_ASPECT_RATIO).lower().strip() + if aspect not in sizes: + aspect = DEFAULT_ASPECT_RATIO + + payload: Dict[str, Any] = dict(meta.get("defaults", {})) + payload["prompt"] = (prompt or "").strip() + + if size_style in ("image_size_preset", "gpt_literal"): + payload["image_size"] = sizes[aspect] + elif size_style == "aspect_ratio": + payload["aspect_ratio"] = sizes[aspect] else: - raise ValueError("image_size must be either a preset string or a dict with width/height") - - # Validate num_inference_steps - if not isinstance(num_inference_steps, int) or num_inference_steps < 1 or num_inference_steps > 100: - raise ValueError("num_inference_steps must be an integer between 1 and 100") - validated["num_inference_steps"] = num_inference_steps - - # Validate guidance_scale (FLUX 2 Pro default is 4.5) - if not isinstance(guidance_scale, (int, float)) or guidance_scale < 0.1 or guidance_scale > 20.0: - raise ValueError("guidance_scale must be a number between 0.1 and 20.0") - validated["guidance_scale"] = float(guidance_scale) - - # Validate num_images - if not isinstance(num_images, int) or num_images < 1 or num_images > 4: - raise 
ValueError("num_images must be an integer between 1 and 4") - validated["num_images"] = num_images - - # Validate output_format - if output_format not in VALID_OUTPUT_FORMATS: - raise ValueError(f"Invalid output_format '{output_format}'. Must be one of: {VALID_OUTPUT_FORMATS}") - validated["output_format"] = output_format - - # Validate acceleration - if acceleration not in VALID_ACCELERATION_MODES: - raise ValueError(f"Invalid acceleration '{acceleration}'. Must be one of: {VALID_ACCELERATION_MODES}") - validated["acceleration"] = acceleration - - return validated + raise ValueError(f"Unknown size_style: {size_style!r}") + + if seed is not None and isinstance(seed, int): + payload["seed"] = seed + + if overrides: + for k, v in overrides.items(): + if v is not None: + payload[k] = v + + supports = meta["supports"] + return {k: v for k, v in payload.items() if k in supports} -def _upscale_image(image_url: str, original_prompt: str) -> Dict[str, Any]: - """ - Upscale an image using FAL.ai's Clarity Upscaler. - - Uses the synchronous fal_client API to avoid event loop lifecycle issues - when called from threaded contexts (e.g. gateway thread pool). - - Args: - image_url (str): URL of the image to upscale - original_prompt (str): Original prompt used to generate the image - - Returns: - Dict[str, Any]: Upscaled image data or None if upscaling fails +# --------------------------------------------------------------------------- +# Upscaler +# --------------------------------------------------------------------------- +def _upscale_image(image_url: str, original_prompt: str) -> Optional[Dict[str, Any]]: + """Upscale an image using FAL.ai's Clarity Upscaler. + + Returns upscaled image dict, or None on failure (caller falls back to + the original image). 
""" try: logger.info("Upscaling image with Clarity Upscaler...") - - # Prepare arguments for upscaler + upscaler_arguments = { "image_url": image_url, "prompt": f"{UPSCALER_DEFAULT_PROMPT}, {original_prompt}", @@ -315,329 +542,239 @@ def _upscale_image(image_url: str, original_prompt: str) -> Dict[str, Any]: "resemblance": UPSCALER_RESEMBLANCE, "guidance_scale": UPSCALER_GUIDANCE_SCALE, "num_inference_steps": UPSCALER_NUM_INFERENCE_STEPS, - "enable_safety_checker": UPSCALER_SAFETY_CHECKER + "enable_safety_checker": UPSCALER_SAFETY_CHECKER, } - - # Use sync API — fal_client.submit() uses httpx.Client (no event loop). - # The async API (submit_async) caches a global httpx.AsyncClient via - # @cached_property, which breaks when asyncio.run() destroys the loop - # between calls (gateway thread-pool pattern). - handler = _submit_fal_request( - UPSCALER_MODEL, - arguments=upscaler_arguments, - ) - - # Get the upscaled result (sync — blocks until done) + + handler = _submit_fal_request(UPSCALER_MODEL, arguments=upscaler_arguments) result = handler.get() - + if result and "image" in result: upscaled_image = result["image"] - logger.info("Image upscaled successfully to %sx%s", upscaled_image.get('width', 'unknown'), upscaled_image.get('height', 'unknown')) + logger.info( + "Image upscaled successfully to %sx%s", + upscaled_image.get("width", "unknown"), + upscaled_image.get("height", "unknown"), + ) return { "url": upscaled_image["url"], "width": upscaled_image.get("width", 0), "height": upscaled_image.get("height", 0), "upscaled": True, - "upscale_factor": UPSCALER_FACTOR + "upscale_factor": UPSCALER_FACTOR, } - else: - logger.error("Upscaler returned invalid response") - return None - + logger.error("Upscaler returned invalid response") + return None + except Exception as e: logger.error("Error upscaling image: %s", e, exc_info=True) return None +# --------------------------------------------------------------------------- +# Tool entry point +# 
--------------------------------------------------------------------------- def image_generate_tool( prompt: str, aspect_ratio: str = DEFAULT_ASPECT_RATIO, - num_inference_steps: int = DEFAULT_NUM_INFERENCE_STEPS, - guidance_scale: float = DEFAULT_GUIDANCE_SCALE, - num_images: int = DEFAULT_NUM_IMAGES, - output_format: str = DEFAULT_OUTPUT_FORMAT, - seed: Optional[int] = None + num_inference_steps: Optional[int] = None, + guidance_scale: Optional[float] = None, + num_images: Optional[int] = None, + output_format: Optional[str] = None, + seed: Optional[int] = None, ) -> str: + """Generate an image from a text prompt using the configured FAL model. + + The agent-facing schema exposes only ``prompt`` and ``aspect_ratio``; the + remaining kwargs are overrides for direct Python callers and are filtered + per-model via the ``supports`` whitelist (unsupported overrides are + silently dropped so legacy callers don't break when switching models). + + Returns a JSON string with ``{"success": bool, "image": url | None, + "error": str, "error_type": str}``. """ - Generate images from text prompts using FAL.ai's FLUX 2 Pro model with automatic upscaling. - - Uses the synchronous fal_client API to avoid event loop lifecycle issues. - The async API's global httpx.AsyncClient (cached via @cached_property) breaks - when asyncio.run() destroys and recreates event loops between calls, which - happens in the gateway's thread-pool pattern. 
- - Args: - prompt (str): The text prompt describing the desired image - aspect_ratio (str): Image aspect ratio - "landscape", "square", or "portrait" (default: "landscape") - num_inference_steps (int): Number of denoising steps (1-50, default: 50) - guidance_scale (float): How closely to follow prompt (0.1-20.0, default: 4.5) - num_images (int): Number of images to generate (1-4, default: 1) - output_format (str): Image format "jpeg" or "png" (default: "png") - seed (Optional[int]): Random seed for reproducible results (optional) - - Returns: - str: JSON string containing minimal generation results: - { - "success": bool, - "image": str or None # URL of the upscaled image, or None if failed - } - """ - # Validate and map aspect_ratio to actual image_size - aspect_ratio_lower = aspect_ratio.lower().strip() if aspect_ratio else DEFAULT_ASPECT_RATIO - if aspect_ratio_lower not in ASPECT_RATIO_MAP: - logger.warning("Invalid aspect_ratio '%s', defaulting to '%s'", aspect_ratio, DEFAULT_ASPECT_RATIO) - aspect_ratio_lower = DEFAULT_ASPECT_RATIO - image_size = ASPECT_RATIO_MAP[aspect_ratio_lower] - + model_id, meta = _resolve_fal_model() + debug_call_data = { + "model": model_id, "parameters": { "prompt": prompt, "aspect_ratio": aspect_ratio, - "image_size": image_size, "num_inference_steps": num_inference_steps, "guidance_scale": guidance_scale, "num_images": num_images, "output_format": output_format, - "seed": seed + "seed": seed, }, "error": None, "success": False, "images_generated": 0, - "generation_time": 0 + "generation_time": 0, } - + start_time = datetime.datetime.now() - + try: - logger.info("Generating %s image(s) with FLUX 2 Pro: %s", num_images, prompt[:80]) - - # Validate prompt if not prompt or not isinstance(prompt, str) or len(prompt.strip()) == 0: raise ValueError("Prompt is required and must be a non-empty string") - - # Check API key availability + if not (os.getenv("FAL_KEY") or _resolve_managed_fal_gateway()): message = "FAL_KEY environment variable 
not set" if managed_nous_tools_enabled(): message += " and managed FAL gateway is unavailable" raise ValueError(message) - - # Validate other parameters - validated_params = _validate_parameters( - image_size, num_inference_steps, guidance_scale, num_images, output_format, "none" + + aspect_lc = (aspect_ratio or DEFAULT_ASPECT_RATIO).lower().strip() + if aspect_lc not in VALID_ASPECT_RATIOS: + logger.warning( + "Invalid aspect_ratio '%s', defaulting to '%s'", + aspect_ratio, DEFAULT_ASPECT_RATIO, + ) + aspect_lc = DEFAULT_ASPECT_RATIO + + overrides: Dict[str, Any] = {} + if num_inference_steps is not None: + overrides["num_inference_steps"] = num_inference_steps + if guidance_scale is not None: + overrides["guidance_scale"] = guidance_scale + if num_images is not None: + overrides["num_images"] = num_images + if output_format is not None: + overrides["output_format"] = output_format + + arguments = _build_fal_payload( + model_id, prompt, aspect_lc, seed=seed, overrides=overrides, ) - - # Prepare arguments for FAL.ai FLUX 2 Pro API - arguments = { - "prompt": prompt.strip(), - "image_size": validated_params["image_size"], - "num_inference_steps": validated_params["num_inference_steps"], - "guidance_scale": validated_params["guidance_scale"], - "num_images": validated_params["num_images"], - "output_format": validated_params["output_format"], - "enable_safety_checker": ENABLE_SAFETY_CHECKER, - "safety_tolerance": SAFETY_TOLERANCE, - "sync_mode": True # Use sync mode for immediate results - } - - # Add seed if provided - if seed is not None and isinstance(seed, int): - arguments["seed"] = seed - - logger.info("Submitting generation request to FAL.ai FLUX 2 Pro...") - logger.info(" Model: %s", DEFAULT_MODEL) - logger.info(" Aspect Ratio: %s -> %s", aspect_ratio_lower, image_size) - logger.info(" Steps: %s", validated_params['num_inference_steps']) - logger.info(" Guidance: %s", validated_params['guidance_scale']) - - # Submit request to FAL.ai using sync API (avoids 
cached event loop issues) - handler = _submit_fal_request( - DEFAULT_MODEL, - arguments=arguments, + + logger.info( + "Generating image with %s (%s) — prompt: %s", + meta.get("display", model_id), model_id, prompt[:80], ) - - # Get the result (sync — blocks until done) + + handler = _submit_fal_request(model_id, arguments=arguments) result = handler.get() - + generation_time = (datetime.datetime.now() - start_time).total_seconds() - - # Process the response + if not result or "images" not in result: - raise ValueError("Invalid response from FAL.ai API - no images returned") - + raise ValueError("Invalid response from FAL.ai API — no images returned") + images = result.get("images", []) if not images: raise ValueError("No images were generated") - - # Format image data and upscale images + + should_upscale = bool(meta.get("upscale", False)) + formatted_images = [] for img in images: - if isinstance(img, dict) and "url" in img: - original_image = { - "url": img["url"], - "width": img.get("width", 0), - "height": img.get("height", 0) - } - - # Attempt to upscale the image + if not (isinstance(img, dict) and "url" in img): + continue + original_image = { + "url": img["url"], + "width": img.get("width", 0), + "height": img.get("height", 0), + } + + if should_upscale: upscaled_image = _upscale_image(img["url"], prompt.strip()) - if upscaled_image: - # Use upscaled image if successful formatted_images.append(upscaled_image) - else: - # Fall back to original image if upscaling fails - logger.warning("Using original image as fallback") - original_image["upscaled"] = False - formatted_images.append(original_image) - + continue + logger.warning("Using original image as fallback (upscale failed)") + + original_image["upscaled"] = False + formatted_images.append(original_image) + if not formatted_images: raise ValueError("No valid image URLs returned from API") - - upscaled_count = sum(1 for img in formatted_images if img.get("upscaled", False)) - logger.info("Generated %s 
image(s) in %.1fs (%s upscaled)", len(formatted_images), generation_time, upscaled_count) - - # Prepare successful response - minimal format + + upscaled_count = sum(1 for img in formatted_images if img.get("upscaled")) + logger.info( + "Generated %s image(s) in %.1fs (%s upscaled) via %s", + len(formatted_images), generation_time, upscaled_count, model_id, + ) + response_data = { "success": True, - "image": formatted_images[0]["url"] if formatted_images else None + "image": formatted_images[0]["url"] if formatted_images else None, } - + debug_call_data["success"] = True debug_call_data["images_generated"] = len(formatted_images) debug_call_data["generation_time"] = generation_time - - # Log debug information _debug.log_call("image_generate_tool", debug_call_data) _debug.save() - + return json.dumps(response_data, indent=2, ensure_ascii=False) - + except Exception as e: generation_time = (datetime.datetime.now() - start_time).total_seconds() error_msg = f"Error generating image: {str(e)}" logger.error("%s", error_msg, exc_info=True) - - # Include error details so callers can diagnose failures + response_data = { "success": False, "image": None, "error": str(e), "error_type": type(e).__name__, } - + debug_call_data["error"] = error_msg debug_call_data["generation_time"] = generation_time _debug.log_call("image_generate_tool", debug_call_data) _debug.save() - + return json.dumps(response_data, indent=2, ensure_ascii=False) def check_fal_api_key() -> bool: - """ - Check if the FAL.ai API key is available in environment variables. - - Returns: - bool: True if API key is set, False otherwise - """ + """True if the FAL.ai API key (direct or managed gateway) is available.""" return bool(os.getenv("FAL_KEY") or _resolve_managed_fal_gateway()) def check_image_generation_requirements() -> bool: - """ - Check if all requirements for image generation tools are met. 
- - Returns: - bool: True if requirements are met, False otherwise - """ + """True if FAL credentials and fal_client SDK are both available.""" try: - # Check API key if not check_fal_api_key(): return False - - # Check if fal_client is available import fal_client # noqa: F401 — SDK presence check return True - except ImportError: return False - +# --------------------------------------------------------------------------- +# Demo / CLI entry point +# --------------------------------------------------------------------------- if __name__ == "__main__": - """ - Simple test/demo when run directly - """ - print("🎨 Image Generation Tools Module - FLUX 2 Pro + Auto Upscaling") + print("🎨 Image Generation Tools — FAL.ai multi-model support") print("=" * 60) - - # Check if API key is available - api_available = check_fal_api_key() - - if not api_available: + + if not check_fal_api_key(): print("❌ FAL_KEY environment variable not set") - print("Please set your API key: export FAL_KEY='your-key-here'") - print("Get API key at: https://fal.ai/") - exit(1) - else: - print("✅ FAL.ai API key found") - - # Check if fal_client is available + print(" Set it via: export FAL_KEY='your-key-here'") + print(" Get a key: https://fal.ai/") + raise SystemExit(1) + print("✅ FAL.ai API key found") + try: - import fal_client + import fal_client # noqa: F401 print("✅ fal_client library available") except ImportError: - print("❌ fal_client library not found") - print("Please install: pip install fal-client") - exit(1) - - print("🛠️ Image generation tools ready for use!") - print(f"🤖 Using model: {DEFAULT_MODEL}") - print(f"🔍 Auto-upscaling with: {UPSCALER_MODEL} ({UPSCALER_FACTOR}x)") - - # Show debug mode status + print("❌ fal_client library not found — pip install fal-client") + raise SystemExit(1) + + model_id, meta = _resolve_fal_model() + print(f"🤖 Active model: {meta.get('display', model_id)} ({model_id})") + print(f" Speed: {meta.get('speed', '?')} · Price: {meta.get('price', '?')}") + 
print(f" Upscaler: {'on' if meta.get('upscale') else 'off'}") + + print("\nAvailable models:") + for mid, m in FAL_MODELS.items(): + marker = " ← active" if mid == model_id else "" + print(f" {mid:<32} {m.get('speed', '?'):<6} {m.get('price', '?')}{marker}") + if _debug.active: - print(f"🐛 Debug mode ENABLED - Session ID: {_debug.session_id}") - print(f" Debug logs will be saved to: ./logs/image_tools_debug_{_debug.session_id}.json") - else: - print("🐛 Debug mode disabled (set IMAGE_TOOLS_DEBUG=true to enable)") - - print("\nBasic usage:") - print(" from image_generation_tool import image_generate_tool") - print(" import asyncio") - print("") - print(" async def main():") - print(" # Generate image with automatic 2x upscaling") - print(" result = await image_generate_tool(") - print(" prompt='A serene mountain landscape with cherry blossoms',") - print(" image_size='landscape_4_3',") - print(" num_images=1") - print(" )") - print(" print(result)") - print(" asyncio.run(main())") - - print("\nSupported image sizes:") - for size in VALID_IMAGE_SIZES: - print(f" - {size}") - print(" - Custom: {'width': 512, 'height': 768} (if needed)") - - print("\nAcceleration modes:") - for mode in VALID_ACCELERATION_MODES: - print(f" - {mode}") - - print("\nExample prompts:") - print(" - 'A candid street photo of a woman with a pink bob and bold eyeliner'") - print(" - 'Modern architecture building with glass facade, sunset lighting'") - print(" - 'Abstract art with vibrant colors and geometric patterns'") - print(" - 'Portrait of a wise old owl perched on ancient tree branch'") - print(" - 'Futuristic cityscape with flying cars and neon lights'") - - print("\nDebug mode:") - print(" # Enable debug logging") - print(" export IMAGE_TOOLS_DEBUG=true") - print(" # Debug logs capture all image generation calls and results") - print(" # Logs saved to: ./logs/image_tools_debug_UUID.json") + print(f"\n🐛 Debug mode enabled — session {_debug.session_id}") # 
--------------------------------------------------------------------------- @@ -647,23 +784,28 @@ from tools.registry import registry, tool_error IMAGE_GENERATE_SCHEMA = { "name": "image_generate", - "description": "Generate high-quality images from text prompts using FLUX 2 Pro model with automatic 2x upscaling. Creates detailed, artistic images that are automatically upscaled for hi-rez results. Returns a single upscaled image URL. Display it using markdown: ![description](URL)", + "description": ( + "Generate high-quality images from text prompts using FAL.ai. " + "The underlying model is user-configured (default: FLUX 2 Klein 9B, " + "sub-1s generation) and is not selectable by the agent. Returns a " + "single image URL. Display it using markdown: ![description](URL)" + ), "parameters": { "type": "object", "properties": { "prompt": { "type": "string", - "description": "The text prompt describing the desired image. Be detailed and descriptive." + "description": "The text prompt describing the desired image. Be detailed and descriptive.", }, "aspect_ratio": { "type": "string", - "enum": ["landscape", "square", "portrait"], + "enum": list(VALID_ASPECT_RATIOS), "description": "The aspect ratio of the generated image. 
'landscape' is 16:9 wide, 'portrait' is 16:9 tall, 'square' is 1:1.", - "default": "landscape" - } + "default": DEFAULT_ASPECT_RATIO, + }, }, - "required": ["prompt"] - } + "required": ["prompt"], + }, } @@ -673,12 +815,7 @@ def _handle_image_generate(args, **kw): return tool_error("prompt is required for image generation") return image_generate_tool( prompt=prompt, - aspect_ratio=args.get("aspect_ratio", "landscape"), - num_inference_steps=50, - guidance_scale=4.5, - num_images=1, - output_format="png", - seed=None, + aspect_ratio=args.get("aspect_ratio", DEFAULT_ASPECT_RATIO), ) @@ -689,6 +826,6 @@ registry.register( handler=_handle_image_generate, check_fn=check_image_generation_requirements, requires_env=[], - is_async=False, # Switched to sync fal_client API to fix "Event loop is closed" in gateway + is_async=False, # sync fal_client API to avoid "Event loop is closed" in gateway emoji="🎨", ) diff --git a/website/docs/reference/tools-reference.md b/website/docs/reference/tools-reference.md index 56c47f833..e1138dc00 100644 --- a/website/docs/reference/tools-reference.md +++ b/website/docs/reference/tools-reference.md @@ -79,7 +79,7 @@ In addition to built-in tools, Hermes can load tools dynamically from MCP server | Tool | Description | Requires environment | |------|-------------|----------------------| -| `image_generate` | Generate high-quality images from text prompts using FLUX 2 Pro model with automatic 2x upscaling. Creates detailed, artistic images that are automatically upscaled for hi-rez results. Returns a single upscaled image URL. Display it using… | FAL_KEY | +| `image_generate` | Generate high-quality images from text prompts using FAL.ai. The underlying model is user-configured (default: FLUX 2 Klein 9B, sub-1s generation) and is not selectable by the agent. Returns a single image URL. 
Display it using… | FAL_KEY | ## `memory` toolset diff --git a/website/docs/user-guide/features/image-generation.md b/website/docs/user-guide/features/image-generation.md index eea563c44..701d4a4fa 100644 --- a/website/docs/user-guide/features/image-generation.md +++ b/website/docs/user-guide/features/image-generation.md @@ -1,18 +1,35 @@ --- title: Image Generation -description: Generate high-quality images using FLUX 2 Pro with automatic upscaling via FAL.ai. +description: Generate images via FAL.ai — 8 models including FLUX 2, GPT-Image, Nano Banana, Ideogram, and more, selectable via `hermes tools`. sidebar_label: Image Generation sidebar_position: 6 --- # Image Generation -Hermes Agent can generate images from text prompts using FAL.ai's **FLUX 2 Pro** model with automatic 2x upscaling via the **Clarity Upscaler** for enhanced quality. +Hermes Agent generates images from text prompts via FAL.ai. Eight models are supported out of the box, each with different speed, quality, and cost tradeoffs. The active model is user-configurable via `hermes tools` and persists in `config.yaml`. + +## Supported Models + +| Model | Speed | Strengths | Price | +|---|---|---|---| +| `fal-ai/flux-2/klein/9b` *(default)* | <1s | Fast, crisp text | $0.006/MP | +| `fal-ai/flux-2-pro` | ~6s | Studio photorealism | $0.03/MP | +| `fal-ai/z-image/turbo` | ~2s | Bilingual EN/CN, 6B params | $0.005/MP | +| `fal-ai/nano-banana` | ~6s | Gemini 2.5, character consistency | $0.08/image | +| `fal-ai/gpt-image-1.5` | ~15s | Prompt adherence | $0.034/image | +| `fal-ai/ideogram/v3` | ~5s | Best typography | $0.03–0.09/image | +| `fal-ai/recraft-v3` | ~8s | Vector art, brand styles | $0.04/image | +| `fal-ai/qwen-image` | ~12s | LLM-based, complex text | $0.02/MP | + +Prices are FAL's pricing at time of writing; check [fal.ai](https://fal.ai/) for current numbers. 
## Setup :::tip Nous Subscribers -If you have a paid [Nous Portal](https://portal.nousresearch.com) subscription, you can use image generation through the **[Tool Gateway](tool-gateway.md)** without a FAL API key. Run `hermes model` or `hermes tools` to enable it. +If you have a paid [Nous Portal](https://portal.nousresearch.com) subscription, you can use image generation through the **[Tool Gateway](tool-gateway.md)** without a FAL API key. Your model selection persists across both paths. + +If the managed gateway returns `HTTP 4xx` for a specific model, that model isn't yet proxied on the portal side — the agent will tell you so, with remediation steps (set `FAL_KEY` for direct access, or pick a different model). ::: ### Get a FAL API Key @@ -20,150 +37,117 @@ If you have a paid [Nous Portal](https://portal.nousresearch.com) subscription, 1. Sign up at [fal.ai](https://fal.ai/) 2. Generate an API key from your dashboard -### Configure the Key +### Configure and Pick a Model + +Run the tools command: ```bash -# Add to ~/.hermes/.env -FAL_KEY=your-fal-api-key-here +hermes tools ``` -### Install the Client Library +Navigate to **🎨 Image Generation**, pick your backend (Nous Subscription or FAL.ai), then the picker shows all supported models in a column-aligned table — arrow keys to navigate, Enter to select: -```bash -pip install fal-client +``` + Model Speed Strengths Price + fal-ai/flux-2/klein/9b <1s Fast, crisp text $0.006/MP ← currently in use + fal-ai/flux-2-pro ~6s Studio photorealism $0.03/MP + fal-ai/z-image/turbo ~2s Bilingual EN/CN, 6B $0.005/MP + ... ``` -:::info -The image generation tool is automatically available when `FAL_KEY` is set. No additional toolset configuration is needed. -::: +Your selection is saved to `config.yaml`: -## How It Works +```yaml +image_gen: + model: fal-ai/flux-2/klein/9b + use_gateway: false # true if using Nous Subscription +``` -When you ask Hermes to generate an image: +### GPT-Image Quality -1. 
**Generation** — Your prompt is sent to the FLUX 2 Pro model (`fal-ai/flux-2-pro`) -2. **Upscaling** — The generated image is automatically upscaled 2x using the Clarity Upscaler (`fal-ai/clarity-upscaler`) -3. **Delivery** — The upscaled image URL is returned - -If upscaling fails for any reason, the original image is returned as a fallback. +The `fal-ai/gpt-image-1.5` request quality is pinned to `medium` (~$0.034/image at 1024×1024). We don't expose the `low` / `high` tiers as a user-facing option so that Nous Portal billing stays predictable across all users — the cost spread between tiers is ~22×. If you want a cheaper or higher-quality result, pick a different model from the Supported Models table above (for example `fal-ai/flux-2/klein/9b` for lower cost or `fal-ai/flux-2-pro` for studio photorealism). ## Usage -Simply ask Hermes to create an image: +The agent-facing schema is intentionally minimal — the tool always uses the image model you configured via `hermes tools`, so a prompt only needs to describe the image and, optionally, its aspect ratio: ``` Generate an image of a serene mountain landscape with cherry blossoms ``` ``` -Create a portrait of a wise old owl perched on an ancient tree branch +Create a square portrait of a wise old owl ``` ``` -Make me a futuristic cityscape with flying cars and neon lights +Make me a futuristic cityscape, landscape orientation ``` -## Parameters - -The `image_generate_tool` accepts these parameters: - -| Parameter | Default | Range | Description | -|-----------|---------|-------|-------------| -| `prompt` | *(required)* | — | Text description of the desired image | -| `aspect_ratio` | `"landscape"` | `landscape`, `square`, `portrait` | Image aspect ratio | -| `num_inference_steps` | `50` | 1–100 | Number of denoising steps (more = higher quality, slower) | -| `guidance_scale` | `4.5` | 0.1–20.0 | How closely to follow the prompt | -| `num_images` | `1` | 1–4 | Number of images to generate | -| `output_format` | `"png"` | `png`, `jpeg` | Image file format | -| `seed` | *(random)* | any integer | Random seed for reproducible results | - ## Aspect Ratios -The tool uses
simplified aspect ratio names that map to FLUX 2 Pro image sizes: +Every model accepts the same three aspect ratios from the agent's perspective. Internally, each model's native size spec is filled in automatically: -| Aspect Ratio | Maps To | Best For | -|-------------|---------|----------| -| `landscape` | `landscape_16_9` | Wallpapers, banners, scenes | -| `square` | `square_hd` | Profile pictures, social media posts | -| `portrait` | `portrait_16_9` | Character art, phone wallpapers | +| Agent input | image_size (flux/z-image/qwen/recraft/ideogram) | aspect_ratio (nano-banana) | image_size (gpt-image) | +|---|---|---|---| +| `landscape` | `landscape_16_9` | `16:9` | `1536x1024` | +| `square` | `square_hd` | `1:1` | `1024x1024` | +| `portrait` | `portrait_16_9` | `9:16` | `1024x1536` | -:::tip -You can also use the raw FLUX 2 Pro size presets directly: `square_hd`, `square`, `portrait_4_3`, `portrait_16_9`, `landscape_4_3`, `landscape_16_9`. Custom sizes up to 2048x2048 are also supported. -::: +This translation happens in `_build_fal_payload()` — agent code never has to know about per-model schema differences. ## Automatic Upscaling -Every generated image is automatically upscaled 2x using FAL.ai's Clarity Upscaler with these settings: +Upscaling via FAL's **Clarity Upscaler** is gated per-model: + +| Model | Upscale? 
| Why | +|---|---|---| +| `fal-ai/flux-2-pro` | ✓ | Backward-compat (was the pre-picker default) | +| All others | ✗ | Fast models would lose their sub-second value prop; hi-res models don't need it | + +When upscaling runs, it uses these settings: | Setting | Value | -|---------|-------| -| Upscale Factor | 2x | +|---|---| +| Upscale factor | 2× | | Creativity | 0.35 | | Resemblance | 0.6 | -| Guidance Scale | 4 | -| Inference Steps | 18 | -| Positive Prompt | `"masterpiece, best quality, highres"` + your original prompt | -| Negative Prompt | `"(worst quality, low quality, normal quality:2)"` | +| Guidance scale | 4 | +| Inference steps | 18 | -The upscaler enhances detail and resolution while preserving the original composition. If the upscaler fails (network issue, rate limit), the original resolution image is returned automatically. +If upscaling fails (network issue, rate limit), the original image is returned automatically. -## Example Prompts +## How It Works Internally -Here are some effective prompts to try: - -``` -A candid street photo of a woman with a pink bob and bold eyeliner -``` - -``` -Modern architecture building with glass facade, sunset lighting -``` - -``` -Abstract art with vibrant colors and geometric patterns -``` - -``` -Portrait of a wise old owl perched on ancient tree branch -``` - -``` -Futuristic cityscape with flying cars and neon lights -``` +1. **Model resolution** — `_resolve_fal_model()` reads `image_gen.model` from `config.yaml`, falls back to the `FAL_IMAGE_MODEL` env var, then to `fal-ai/flux-2/klein/9b`. +2. **Payload building** — `_build_fal_payload()` translates your `aspect_ratio` into the model's native format (preset enum, aspect-ratio enum, or GPT literal), merges the model's default params, applies any caller overrides, then filters to the model's `supports` whitelist so unsupported keys are never sent. +3. **Submission** — `_submit_fal_request()` routes via direct FAL credentials or the managed Nous gateway. +4. 
**Upscaling** — runs only if the model's metadata has `upscale: True`. +5. **Delivery** — final image URL returned to the agent, which emits a `MEDIA:` tag that platform adapters convert to native media. ## Debugging -Enable debug logging for image generation: +Enable debug logging: ```bash export IMAGE_TOOLS_DEBUG=true ``` -Debug logs are saved to `./logs/image_tools_debug_<session_id>.json` with details about each generation request, parameters, timing, and any errors. - -## Safety Settings - -The image generation tool runs with safety checks disabled by default (`safety_tolerance: 5`, the most permissive setting). This is configured at the code level and is not user-adjustable. +Debug logs go to `./logs/image_tools_debug_<session_id>.json` with per-call details (model, parameters, timing, errors). ## Platform Delivery -Generated images are delivered differently depending on the platform: - -| Platform | Delivery method | -|----------|----------------| -| **CLI** | Image URL printed as markdown `![description](url)` — click to open in browser | -| **Telegram** | Image sent as a photo message with the prompt as caption | -| **Discord** | Image embedded in a message | -| **Slack** | Image URL in message (Slack unfurls it) | -| **WhatsApp** | Image sent as a media message | -| **Other platforms** | Image URL in plain text | - -The agent uses `MEDIA:` syntax in its response, which the platform adapter converts to the appropriate format.
+| Platform | Delivery | +|---|---| +| **CLI** | Image URL printed as markdown `![](url)` — click to open | +| **Telegram** | Photo message with the prompt as caption | +| **Discord** | Embedded in a message | +| **Slack** | URL unfurled by Slack | +| **WhatsApp** | Media message | +| **Others** | URL in plain text | ## Limitations -- **Requires FAL API key** — image generation incurs API costs on your FAL.ai account -- **No image editing** — this is text-to-image only, no inpainting or img2img -- **URL-based delivery** — images are returned as temporary FAL.ai URLs, not saved locally. URLs expire after a period (typically hours) -- **Upscaling adds latency** — the automatic 2x upscale step adds processing time -- **Max 4 images per request** — `num_images` is capped at 4 +- **Requires FAL credentials** (direct `FAL_KEY` or Nous Subscription) +- **Text-to-image only** — no inpainting, img2img, or editing via this tool +- **Temporary URLs** — FAL returns hosted URLs that expire after hours/days; save locally if needed +- **Per-model constraints** — some models don't support `seed`, `num_inference_steps`, etc. The `supports` filter silently drops unsupported params; this is expected behavior diff --git a/website/docs/user-guide/features/overview.md b/website/docs/user-guide/features/overview.md index 2d26e153a..10ecb90ba 100644 --- a/website/docs/user-guide/features/overview.md +++ b/website/docs/user-guide/features/overview.md @@ -30,7 +30,7 @@ Hermes Agent includes a rich set of capabilities that extend far beyond basic ch - **[Voice Mode](voice-mode.md)** — Full voice interaction across CLI and messaging platforms. Talk to the agent using your microphone, hear spoken replies, and have live voice conversations in Discord voice channels. - **[Browser Automation](browser.md)** — Full browser automation with multiple backends: Browserbase cloud, Browser Use cloud, local Chrome via CDP, or local Chromium. Navigate websites, fill forms, and extract information. 
- **[Vision & Image Paste](vision.md)** — Multimodal vision support. Paste images from your clipboard into the CLI and ask the agent to analyze, describe, or work with them using any vision-capable model. -- **[Image Generation](image-generation.md)** — Generate images from text prompts using FAL.ai's FLUX 2 Pro model with automatic 2x upscaling via the Clarity Upscaler. +- **[Image Generation](image-generation.md)** — Generate images from text prompts using FAL.ai. Eight models supported (FLUX 2 Klein/Pro, GPT-Image 1.5, Nano Banana, Ideogram V3, Recraft V3, Qwen, Z-Image Turbo); pick one via `hermes tools`. - **[Voice & TTS](tts.md)** — Text-to-speech output and voice message transcription across all messaging platforms, with five provider options: Edge TTS (free), ElevenLabs, OpenAI TTS, MiniMax, and NeuTTS. ## Integrations diff --git a/website/docs/user-guide/features/tool-gateway.md b/website/docs/user-guide/features/tool-gateway.md index e53878949..b33f8e09d 100644 --- a/website/docs/user-guide/features/tool-gateway.md +++ b/website/docs/user-guide/features/tool-gateway.md @@ -18,7 +18,7 @@ The **Tool Gateway** lets paid [Nous Portal](https://portal.nousresearch.com) su | Tool | What It Does | Direct Alternative | |------|--------------|--------------------| | **Web search & extract** | Search the web and extract page content via Firecrawl | `FIRECRAWL_API_KEY`, `EXA_API_KEY`, `PARALLEL_API_KEY`, `TAVILY_API_KEY` | -| **Image generation** | Generate images via FAL (FLUX 2 Pro + upscaling) | `FAL_KEY` | +| **Image generation** | Generate images via FAL (8 models: FLUX 2 Klein/Pro, GPT-Image, Nano Banana, Ideogram, Recraft, Qwen, Z-Image) | `FAL_KEY` | | **Text-to-speech** | Convert text to speech via OpenAI TTS | `VOICE_TOOLS_OPENAI_KEY`, `ELEVENLABS_API_KEY` | | **Browser automation** | Control cloud browsers via Browser Use | `BROWSER_USE_API_KEY`, `BROWSERBASE_API_KEY` |