From 6ee046a72fde50dc064b02c86ffe67ff112546ec Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Wed, 3 Jun 2026 06:38:40 -0700
Subject: [PATCH] fix(doctor): detect + repair stale HERMES_MAX_ITERATIONS .env
 ghost shadowing config.yaml (#38222)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(doctor): detect + repair stale HERMES_MAX_ITERATIONS .env ghost shadowing config.yaml

hermes doctor now flags when ~/.hermes/.env carries a HERMES_MAX_ITERATIONS
value that disagrees with agent.max_turns in config.yaml, and 'hermes doctor
--fix' removes the stale .env line so config.yaml is authoritative. 'hermes
config show' surfaces the same drift inline under Max turns.

The setup wizard stopped dual-writing this value, but users who edited only
config.yaml from a pre-fix install keep a .env ghost. The gateway bridge
normally overrides it at startup, but if the bridge bails on any earlier
config-parse error the ghost silently wins — config says 400 while the
gateway activity line reads N/90.

The detector reads the .env FILE directly (load_env), not get_env_value/
os.environ, since the startup bridge may already have overwritten os.environ
with the config value.

Closes #17534.

* fix(config): stop offering HERMES_MAX_ITERATIONS as an editable env var

Removes HERMES_MAX_ITERATIONS from OPTIONAL_ENV_VARS so the dashboard env
editor (PUT /api/env) and any env-var prompt no longer let a user write it
to .env — which would recreate the stale ghost that shadows config.yaml's
agent.max_turns (issue #17534). The iteration budget is configured only via
config.yaml; the env var stays a read-only backward-compat fallback in the
gateway/CLI, never a promoted write target.

Regression test asserts it is absent from OPTIONAL_ENV_VARS.
---
 hermes_cli/config.py            | 23 +++++----
 hermes_cli/doctor.py            | 57 ++++++++++++++++++++++
 tests/hermes_cli/test_config.py | 12 +++++
 tests/hermes_cli/test_doctor.py | 84 +++++++++++++++++++++++++++++++++
 4 files changed, 168 insertions(+), 8 deletions(-)

diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index 4cb822b19..447d5dc85 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -3346,13 +3346,6 @@ OPTIONAL_ENV_VARS = {
         "password": True,
         "category": "setting",
     },
-    "HERMES_MAX_ITERATIONS": {
-        "description": "Maximum tool-calling iterations per conversation (default: 90)",
-        "prompt": "Max iterations",
-        "url": None,
-        "password": False,
-        "category": "setting",
-    },
     # HERMES_TOOL_PROGRESS and HERMES_TOOL_PROGRESS_MODE are deprecated —
     # now configured via display.tool_progress in config.yaml (off|new|all|verbose).
     # Gateway falls back to these env vars for backward compatibility.
@@ -5593,7 +5586,21 @@ def show_config():
     print()
     print(color("◆ Model", Colors.CYAN, Colors.BOLD))
     print(f"  Model:        {config.get('model', 'not set')}")
-    print(f"  Max turns:    {config.get('agent', {}).get('max_turns', DEFAULT_CONFIG['agent']['max_turns'])}")
+    _cfg_max_turns = config.get('agent', {}).get('max_turns', DEFAULT_CONFIG['agent']['max_turns'])
+    print(f"  Max turns:    {_cfg_max_turns}")
+    # Warn on stale HERMES_MAX_ITERATIONS ghost in .env that disagrees with
+    # config.yaml (issue #17534). Read the .env FILE directly so we catch the
+    # ghost even when the gateway bridge already overrode os.environ.
+    try:
+        _env_ghost = load_env().get("HERMES_MAX_ITERATIONS")
+        if _env_ghost is not None and str(_env_ghost).strip() != str(_cfg_max_turns).strip():
+            print(color(
+                f"                ⚠ .env has stale HERMES_MAX_ITERATIONS={_env_ghost} "
+                f"(run 'hermes doctor --fix' to remove)",
+                Colors.YELLOW,
+            ))
+    except Exception:
+        pass
     
     # Display
     print()
diff --git a/hermes_cli/doctor.py b/hermes_cli/doctor.py
index 4971f1fae..910130ac4 100644
--- a/hermes_cli/doctor.py
+++ b/hermes_cli/doctor.py
@@ -891,6 +891,63 @@ def run_doctor(args):
         except Exception:
             pass
 
+        # Detect stale HERMES_MAX_ITERATIONS ghost in .env shadowing
+        # agent.max_turns in config.yaml (issue #17534). The setup wizard
+        # used to dual-write the iteration budget to both stores; users who
+        # later edit only config.yaml are left with a .env ghost. The gateway
+        # bridge normally derives HERMES_MAX_ITERATIONS from agent.max_turns
+        # at startup, but if that bridge bails (any earlier config-parse
+        # error), the stale .env value silently wins and the agent runs at the
+        # wrong budget — e.g. config says 400 but the activity line reads N/90.
+        # Read the .env FILE directly (load_env), not get_env_value/os.environ,
+        # which the startup bridge may already have overridden.
+        try:
+            import yaml
+            from hermes_cli.config import load_env, remove_env_value
+            with open(config_path, encoding="utf-8") as f:
+                raw_config = yaml.safe_load(f) or {}
+            agent_cfg = raw_config.get("agent")
+            cfg_max_turns = (
+                agent_cfg.get("max_turns")
+                if isinstance(agent_cfg, dict)
+                else None
+            )
+            # Legacy root-level key counts too.
+            if cfg_max_turns is None:
+                cfg_max_turns = raw_config.get("max_turns")
+            env_ghost = load_env().get("HERMES_MAX_ITERATIONS")
+            drift = (
+                cfg_max_turns is not None
+                and env_ghost is not None
+                and str(cfg_max_turns).strip() != str(env_ghost).strip()
+            )
+            if drift:
+                check_warn(
+                    f"HERMES_MAX_ITERATIONS={env_ghost} in .env shadows "
+                    f"agent.max_turns={cfg_max_turns} in config.yaml",
+                    "(stale ghost from an earlier `hermes setup` run)",
+                )
+                if should_fix:
+                    if remove_env_value("HERMES_MAX_ITERATIONS"):
+                        check_ok(
+                            "Removed stale HERMES_MAX_ITERATIONS from .env "
+                            f"(config.yaml agent.max_turns={cfg_max_turns} is now authoritative)"
+                        )
+                        fixed_count += 1
+                    else:
+                        check_warn("Could not remove HERMES_MAX_ITERATIONS from .env")
+                        manual_issues.append(
+                            "Manually delete the HERMES_MAX_ITERATIONS line from "
+                            f"{_DHH}/.env — config.yaml agent.max_turns is authoritative."
+                        )
+                else:
+                    issues.append(
+                        "Stale HERMES_MAX_ITERATIONS in .env shadows config.yaml — "
+                        "run 'hermes doctor --fix'"
+                    )
+        except Exception:
+            pass
+
         # Validate config structure (catches malformed custom_providers, etc.)
         try:
             from hermes_cli.config import validate_config_structure
diff --git a/tests/hermes_cli/test_config.py b/tests/hermes_cli/test_config.py
index eb4827a41..ec0a6aea7 100644
--- a/tests/hermes_cli/test_config.py
+++ b/tests/hermes_cli/test_config.py
@@ -486,6 +486,18 @@ class TestOptionalEnvVarsRegistry:
             all_vars.extend(vars_list)
         assert "TAVILY_API_KEY" in all_vars
 
+    def test_max_iterations_not_offered_as_env_var(self):
+        """HERMES_MAX_ITERATIONS must NOT be in OPTIONAL_ENV_VARS (issue #17534).
+
+        Offering it as an editable env var (dashboard, `hermes setup`) lets a
+        user write it to .env, recreating the stale ghost that shadows
+        config.yaml's agent.max_turns. The iteration budget is configured ONLY
+        via config.yaml; HERMES_MAX_ITERATIONS remains a read-only backward-compat
+        fallback in the gateway/CLI, never a promoted write target.
+        """
+        from hermes_cli.config import OPTIONAL_ENV_VARS
+        assert "HERMES_MAX_ITERATIONS" not in OPTIONAL_ENV_VARS
+
 
 class TestConfigMigrationSecretPrompts:
     def test_required_secret_env_prompt_uses_masked_prompt(self, tmp_path, monkeypatch):
diff --git a/tests/hermes_cli/test_doctor.py b/tests/hermes_cli/test_doctor.py
index e1edc95af..9d832fa95 100644
--- a/tests/hermes_cli/test_doctor.py
+++ b/tests/hermes_cli/test_doctor.py
@@ -1274,3 +1274,87 @@ class TestDoctorCodexCliHintPlacement:
         minimax_idx = next(i for i, l in enumerate(lines) if "MiniMax OAuth" in l)
         assert self._hint_line() not in lines[minimax_idx - 1]
         assert minimax_idx + 1 >= len(lines) or self._hint_line() not in lines[minimax_idx + 1]
+
+
+class TestDoctorStaleMaxIterationsDrift:
+    """Regression for #17534: a stale HERMES_MAX_ITERATIONS in .env shadows
+    agent.max_turns in config.yaml. The repro symptom is config.yaml saying
+    400 while the gateway activity line reads N/90. Doctor must detect the
+    drift, and `--fix` must remove the .env ghost (config.yaml wins).
+
+    The detector reads the .env FILE directly, NOT os.environ — the gateway
+    startup bridge can already have overridden os.environ to the config value,
+    so the ghost is only visible in the file.
+    """
+
+    def _run_config_section(self, monkeypatch, tmp_path, *, fix, ghost, cfg_turns,
+                            os_environ_value=None):
+        import pathlib
+        import contextlib
+        import io
+        from argparse import Namespace
+
+        hermes_home = tmp_path / ".hermes"
+        hermes_home.mkdir(parents=True)
+        (hermes_home / "config.yaml").write_text(
+            f"agent:\n  max_turns: {cfg_turns}\n", encoding="utf-8"
+        )
+        env_lines = ["OPENAI_API_KEY=sk-test\n"]
+        if ghost is not None:
+            env_lines.append(f"HERMES_MAX_ITERATIONS={ghost}\n")
+        (hermes_home / ".env").write_text("".join(env_lines), encoding="utf-8")
+
+        monkeypatch.setattr(doctor_mod, "HERMES_HOME", hermes_home)
+        monkeypatch.setattr(doctor_mod, "get_hermes_home", lambda: hermes_home)
+        # Point the config helpers at the temp home.
+        monkeypatch.setenv("HERMES_HOME", str(hermes_home))
+        if os_environ_value is not None:
+            # Simulate the gateway bridge having already overridden os.environ.
+            monkeypatch.setenv("HERMES_MAX_ITERATIONS", str(os_environ_value))
+        else:
+            monkeypatch.delenv("HERMES_MAX_ITERATIONS", raising=False)
+
+        # Short-circuit at the Tool Availability stage — the drift check runs
+        # well before it in the Configuration Files section.
+        fake_model_tools = types.SimpleNamespace(
+            check_tool_availability=lambda *a, **kw: (_ for _ in ()).throw(SystemExit(0)),
+            TOOLSET_REQUIREMENTS={},
+        )
+        monkeypatch.setitem(sys.modules, "model_tools", fake_model_tools)
+
+        buf = io.StringIO()
+        with contextlib.redirect_stdout(buf), pytest.raises(SystemExit):
+            doctor_mod.run_doctor(Namespace(fix=fix))
+        return buf.getvalue(), hermes_home
+
+    def test_detects_drift_warn_only(self, monkeypatch, tmp_path):
+        out, hermes_home = self._run_config_section(
+            monkeypatch, tmp_path, fix=False, ghost=90, cfg_turns=400,
+            os_environ_value=400,  # bridge contaminated os.environ
+        )
+        assert "HERMES_MAX_ITERATIONS=90" in out
+        assert "shadows" in out
+        # Warn-only must NOT mutate .env.
+        assert "HERMES_MAX_ITERATIONS=90" in (hermes_home / ".env").read_text(encoding="utf-8")
+
+    def test_fix_removes_ghost(self, monkeypatch, tmp_path):
+        out, hermes_home = self._run_config_section(
+            monkeypatch, tmp_path, fix=True, ghost=90, cfg_turns=400,
+            os_environ_value=400,
+        )
+        assert "Removed stale HERMES_MAX_ITERATIONS" in out
+        env_after = (hermes_home / ".env").read_text(encoding="utf-8")
+        assert "HERMES_MAX_ITERATIONS" not in env_after
+        assert "OPENAI_API_KEY=sk-test" in env_after  # other keys preserved
+
+    def test_no_drift_when_values_match(self, monkeypatch, tmp_path):
+        out, _ = self._run_config_section(
+            monkeypatch, tmp_path, fix=False, ghost=400, cfg_turns=400,
+        )
+        assert "shadows" not in out
+
+    def test_no_drift_when_ghost_absent(self, monkeypatch, tmp_path):
+        out, _ = self._run_config_section(
+            monkeypatch, tmp_path, fix=False, ghost=None, cfg_turns=400,
+        )
+        assert "shadows" not in out