fix(doctor): detect + repair stale HERMES_MAX_ITERATIONS .env ghost shadowing config.yaml (#38222)

* fix(doctor): detect + repair stale HERMES_MAX_ITERATIONS .env ghost shadowing config.yaml hermes doctor now flags when ~/.hermes/.env carries a HERMES_MAX_ITERATIONS value that disagrees with agent.max_turns in config.yaml, and 'hermes doctor --fix' removes the stale .env line so config.yaml is authoritative. 'hermes config show' surfaces the same drift inline under Max turns. The setup wizard stopped dual-writing this value, but users who edited only config.yaml from a pre-fix install keep a .env ghost. The gateway bridge normally overrides it at startup, but if the bridge bails on any earlier config-parse error the ghost silently wins — config says 400 while the gateway activity line reads N/90. The detector reads the .env FILE directly (load_env), not get_env_value/ os.environ, since the startup bridge may already have overwritten os.environ with the config value. Closes #17534. * fix(config): stop offering HERMES_MAX_ITERATIONS as an editable env var Removes HERMES_MAX_ITERATIONS from OPTIONAL_ENV_VARS so the dashboard env editor (PUT /api/env) and any env-var prompt no longer let a user write it to .env — which would recreate the stale ghost that shadows config.yaml's agent.max_turns (issue #17534). The iteration budget is configured only via config.yaml; the env var stays a read-only backward-compat fallback in the gateway/CLI, never a promoted write target. Regression test asserts it is absent from OPTIONAL_ENV_VARS.
2026-06-03 06:38:40 -07:00
parent de26b17854
commit 6ee046a72f
4 changed files with 168 additions and 8 deletions
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -3346,13 +3346,6 @@ OPTIONAL_ENV_VARS = {
        "password": True,
        "category": "setting",
    },
-    "HERMES_MAX_ITERATIONS": {
-        "description": "Maximum tool-calling iterations per conversation (default: 90)",
-        "prompt": "Max iterations",
-        "url": None,
-        "password": False,
-        "category": "setting",
-    },
    # HERMES_TOOL_PROGRESS and HERMES_TOOL_PROGRESS_MODE are deprecated —
    # now configured via display.tool_progress in config.yaml (off|new|all|verbose).
    # Gateway falls back to these env vars for backward compatibility.
@ -5593,7 +5586,21 @@ def show_config():
    print()
    print(color("◆ Model", Colors.CYAN, Colors.BOLD))
    print(f"  Model:        {config.get('model', 'not set')}")
-    print(f"  Max turns:    {config.get('agent', {}).get('max_turns', DEFAULT_CONFIG['agent']['max_turns'])}")
+    _cfg_max_turns = config.get('agent', {}).get('max_turns', DEFAULT_CONFIG['agent']['max_turns'])
+    print(f"  Max turns:    {_cfg_max_turns}")
+    # Warn on stale HERMES_MAX_ITERATIONS ghost in .env that disagrees with
+    # config.yaml (issue #17534). Read the .env FILE directly so we catch the
+    # ghost even when the gateway bridge already overrode os.environ.
+    try:
+        _env_ghost = load_env().get("HERMES_MAX_ITERATIONS")
+        if _env_ghost is not None and str(_env_ghost).strip() != str(_cfg_max_turns).strip():
+            print(color(
+                f"                ⚠ .env has stale HERMES_MAX_ITERATIONS={_env_ghost} "
+                f"(run 'hermes doctor --fix' to remove)",
+                Colors.YELLOW,
+            ))
+    except Exception:
+        pass
    
    # Display
    print()
--- a/hermes_cli/doctor.py
+++ b/hermes_cli/doctor.py
@ -891,6 +891,63 @@ def run_doctor(args):
        except Exception:
            pass

+        # Detect stale HERMES_MAX_ITERATIONS ghost in .env shadowing
+        # agent.max_turns in config.yaml (issue #17534). The setup wizard
+        # used to dual-write the iteration budget to both stores; users who
+        # later edit only config.yaml are left with a .env ghost. The gateway
+        # bridge normally derives HERMES_MAX_ITERATIONS from agent.max_turns
+        # at startup, but if that bridge bails (any earlier config-parse
+        # error), the stale .env value silently wins and the agent runs at the
+        # wrong budget — e.g. config says 400 but the activity line reads N/90.
+        # Read the .env FILE directly (load_env), not get_env_value/os.environ,
+        # which the startup bridge may already have overridden.
+        try:
+            import yaml
+            from hermes_cli.config import load_env, remove_env_value
+            with open(config_path, encoding="utf-8") as f:
+                raw_config = yaml.safe_load(f) or {}
+            agent_cfg = raw_config.get("agent")
+            cfg_max_turns = (
+                agent_cfg.get("max_turns")
+                if isinstance(agent_cfg, dict)
+                else None
+            )
+            # Legacy root-level key counts too.
+            if cfg_max_turns is None:
+                cfg_max_turns = raw_config.get("max_turns")
+            env_ghost = load_env().get("HERMES_MAX_ITERATIONS")
+            drift = (
+                cfg_max_turns is not None
+                and env_ghost is not None
+                and str(cfg_max_turns).strip() != str(env_ghost).strip()
+            )
+            if drift:
+                check_warn(
+                    f"HERMES_MAX_ITERATIONS={env_ghost} in .env shadows "
+                    f"agent.max_turns={cfg_max_turns} in config.yaml",
+                    "(stale ghost from an earlier `hermes setup` run)",
+                )
+                if should_fix:
+                    if remove_env_value("HERMES_MAX_ITERATIONS"):
+                        check_ok(
+                            "Removed stale HERMES_MAX_ITERATIONS from .env "
+                            f"(config.yaml agent.max_turns={cfg_max_turns} is now authoritative)"
+                        )
+                        fixed_count += 1
+                    else:
+                        check_warn("Could not remove HERMES_MAX_ITERATIONS from .env")
+                        manual_issues.append(
+                            "Manually delete the HERMES_MAX_ITERATIONS line from "
+                            f"{_DHH}/.env — config.yaml agent.max_turns is authoritative."
+                        )
+                else:
+                    issues.append(
+                        "Stale HERMES_MAX_ITERATIONS in .env shadows config.yaml — "
+                        "run 'hermes doctor --fix'"
+                    )
+        except Exception:
+            pass
+
        # Validate config structure (catches malformed custom_providers, etc.)
        try:
            from hermes_cli.config import validate_config_structure
--- a/tests/hermes_cli/test_config.py
+++ b/tests/hermes_cli/test_config.py
@ -486,6 +486,18 @@ class TestOptionalEnvVarsRegistry:
            all_vars.extend(vars_list)
        assert "TAVILY_API_KEY" in all_vars

+    def test_max_iterations_not_offered_as_env_var(self):
+        """HERMES_MAX_ITERATIONS must NOT be in OPTIONAL_ENV_VARS (issue #17534).
+
+        Offering it as an editable env var (dashboard, `hermes setup`) lets a
+        user write it to .env, recreating the stale ghost that shadows
+        config.yaml's agent.max_turns. The iteration budget is configured ONLY
+        via config.yaml; HERMES_MAX_ITERATIONS remains a read-only backward-compat
+        fallback in the gateway/CLI, never a promoted write target.
+        """
+        from hermes_cli.config import OPTIONAL_ENV_VARS
+        assert "HERMES_MAX_ITERATIONS" not in OPTIONAL_ENV_VARS
+

 class TestConfigMigrationSecretPrompts:
    def test_required_secret_env_prompt_uses_masked_prompt(self, tmp_path, monkeypatch):
--- a/tests/hermes_cli/test_doctor.py
+++ b/tests/hermes_cli/test_doctor.py
@ -1274,3 +1274,87 @@ class TestDoctorCodexCliHintPlacement:
        minimax_idx = next(i for i, l in enumerate(lines) if "MiniMax OAuth" in l)
        assert self._hint_line() not in lines[minimax_idx - 1]
        assert minimax_idx + 1 >= len(lines) or self._hint_line() not in lines[minimax_idx + 1]
+
+
+class TestDoctorStaleMaxIterationsDrift:
+    """Regression for #17534: a stale HERMES_MAX_ITERATIONS in .env shadows
+    agent.max_turns in config.yaml. The repro symptom is config.yaml saying
+    400 while the gateway activity line reads N/90. Doctor must detect the
+    drift, and `--fix` must remove the .env ghost (config.yaml wins).
+
+    The detector reads the .env FILE directly, NOT os.environ — the gateway
+    startup bridge can already have overridden os.environ to the config value,
+    so the ghost is only visible in the file.
+    """
+
+    def _run_config_section(self, monkeypatch, tmp_path, *, fix, ghost, cfg_turns,
+                            os_environ_value=None):
+        import pathlib
+        import contextlib
+        import io
+        from argparse import Namespace
+
+        hermes_home = tmp_path / ".hermes"
+        hermes_home.mkdir(parents=True)
+        (hermes_home / "config.yaml").write_text(
+            f"agent:\n  max_turns: {cfg_turns}\n", encoding="utf-8"
+        )
+        env_lines = ["OPENAI_API_KEY=sk-test\n"]
+        if ghost is not None:
+            env_lines.append(f"HERMES_MAX_ITERATIONS={ghost}\n")
+        (hermes_home / ".env").write_text("".join(env_lines), encoding="utf-8")
+
+        monkeypatch.setattr(doctor_mod, "HERMES_HOME", hermes_home)
+        monkeypatch.setattr(doctor_mod, "get_hermes_home", lambda: hermes_home)
+        # Point the config helpers at the temp home.
+        monkeypatch.setenv("HERMES_HOME", str(hermes_home))
+        if os_environ_value is not None:
+            # Simulate the gateway bridge having already overridden os.environ.
+            monkeypatch.setenv("HERMES_MAX_ITERATIONS", str(os_environ_value))
+        else:
+            monkeypatch.delenv("HERMES_MAX_ITERATIONS", raising=False)
+
+        # Short-circuit at the Tool Availability stage — the drift check runs
+        # well before it in the Configuration Files section.
+        fake_model_tools = types.SimpleNamespace(
+            check_tool_availability=lambda *a, **kw: (_ for _ in ()).throw(SystemExit(0)),
+            TOOLSET_REQUIREMENTS={},
+        )
+        monkeypatch.setitem(sys.modules, "model_tools", fake_model_tools)
+
+        buf = io.StringIO()
+        with contextlib.redirect_stdout(buf), pytest.raises(SystemExit):
+            doctor_mod.run_doctor(Namespace(fix=fix))
+        return buf.getvalue(), hermes_home
+
+    def test_detects_drift_warn_only(self, monkeypatch, tmp_path):
+        out, hermes_home = self._run_config_section(
+            monkeypatch, tmp_path, fix=False, ghost=90, cfg_turns=400,
+            os_environ_value=400,  # bridge contaminated os.environ
+        )
+        assert "HERMES_MAX_ITERATIONS=90" in out
+        assert "shadows" in out
+        # Warn-only must NOT mutate .env.
+        assert "HERMES_MAX_ITERATIONS=90" in (hermes_home / ".env").read_text(encoding="utf-8")
+
+    def test_fix_removes_ghost(self, monkeypatch, tmp_path):
+        out, hermes_home = self._run_config_section(
+            monkeypatch, tmp_path, fix=True, ghost=90, cfg_turns=400,
+            os_environ_value=400,
+        )
+        assert "Removed stale HERMES_MAX_ITERATIONS" in out
+        env_after = (hermes_home / ".env").read_text(encoding="utf-8")
+        assert "HERMES_MAX_ITERATIONS" not in env_after
+        assert "OPENAI_API_KEY=sk-test" in env_after  # other keys preserved
+
+    def test_no_drift_when_values_match(self, monkeypatch, tmp_path):
+        out, _ = self._run_config_section(
+            monkeypatch, tmp_path, fix=False, ghost=400, cfg_turns=400,
+        )
+        assert "shadows" not in out
+
+    def test_no_drift_when_ghost_absent(self, monkeypatch, tmp_path):
+        out, _ = self._run_config_section(
+            monkeypatch, tmp_path, fix=False, ghost=None, cfg_turns=400,
+        )
+        assert "shadows" not in out