fix(doctor): detect + repair stale HERMES_MAX_ITERATIONS .env ghost shadowing config.yaml (#38222)

* fix(doctor): detect + repair stale HERMES_MAX_ITERATIONS .env ghost shadowing config.yaml

hermes doctor now flags when ~/.hermes/.env carries a HERMES_MAX_ITERATIONS
value that disagrees with agent.max_turns in config.yaml, and 'hermes doctor
--fix' removes the stale .env line so config.yaml is authoritative. 'hermes
config show' surfaces the same drift inline under Max turns.

The setup wizard stopped dual-writing this value, but users who edited only
config.yaml from a pre-fix install keep a .env ghost. The gateway bridge
normally overrides it at startup, but if the bridge bails on any earlier
config-parse error the ghost silently wins — config says 400 while the
gateway activity line reads N/90.

The detector reads the .env FILE directly (load_env), not get_env_value/
os.environ, since the startup bridge may already have overwritten os.environ
with the config value.

Closes #17534.

* fix(config): stop offering HERMES_MAX_ITERATIONS as an editable env var

Removes HERMES_MAX_ITERATIONS from OPTIONAL_ENV_VARS so the dashboard env
editor (PUT /api/env) and any env-var prompt no longer let a user write it
to .env — which would recreate the stale ghost that shadows config.yaml's
agent.max_turns (issue #17534). The iteration budget is configured only via
config.yaml; the env var stays a read-only backward-compat fallback in the
gateway/CLI, never a promoted write target.

Regression test asserts it is absent from OPTIONAL_ENV_VARS.
This commit is contained in:
Teknium
2026-06-03 06:38:40 -07:00
committed by GitHub
parent de26b17854
commit 6ee046a72f
4 changed files with 168 additions and 8 deletions

View File

@ -3346,13 +3346,6 @@ OPTIONAL_ENV_VARS = {
"password": True,
"category": "setting",
},
"HERMES_MAX_ITERATIONS": {
"description": "Maximum tool-calling iterations per conversation (default: 90)",
"prompt": "Max iterations",
"url": None,
"password": False,
"category": "setting",
},
# HERMES_TOOL_PROGRESS and HERMES_TOOL_PROGRESS_MODE are deprecated —
# now configured via display.tool_progress in config.yaml (off|new|all|verbose).
# Gateway falls back to these env vars for backward compatibility.
@ -5593,7 +5586,21 @@ def show_config():
print()
print(color("◆ Model", Colors.CYAN, Colors.BOLD))
print(f" Model: {config.get('model', 'not set')}")
print(f" Max turns: {config.get('agent', {}).get('max_turns', DEFAULT_CONFIG['agent']['max_turns'])}")
_cfg_max_turns = config.get('agent', {}).get('max_turns', DEFAULT_CONFIG['agent']['max_turns'])
print(f" Max turns: {_cfg_max_turns}")
# Warn on stale HERMES_MAX_ITERATIONS ghost in .env that disagrees with
# config.yaml (issue #17534). Read the .env FILE directly so we catch the
# ghost even when the gateway bridge already overrode os.environ.
try:
_env_ghost = load_env().get("HERMES_MAX_ITERATIONS")
if _env_ghost is not None and str(_env_ghost).strip() != str(_cfg_max_turns).strip():
print(color(
f" ⚠ .env has stale HERMES_MAX_ITERATIONS={_env_ghost} "
f"(run 'hermes doctor --fix' to remove)",
Colors.YELLOW,
))
except Exception:
pass
# Display
print()

View File

@ -891,6 +891,63 @@ def run_doctor(args):
except Exception:
pass
# Detect stale HERMES_MAX_ITERATIONS ghost in .env shadowing
# agent.max_turns in config.yaml (issue #17534). The setup wizard
# used to dual-write the iteration budget to both stores; users who
# later edit only config.yaml are left with a .env ghost. The gateway
# bridge normally derives HERMES_MAX_ITERATIONS from agent.max_turns
# at startup, but if that bridge bails (any earlier config-parse
# error), the stale .env value silently wins and the agent runs at the
# wrong budget — e.g. config says 400 but the activity line reads N/90.
# Read the .env FILE directly (load_env), not get_env_value/os.environ,
# which the startup bridge may already have overridden.
try:
import yaml
from hermes_cli.config import load_env, remove_env_value
with open(config_path, encoding="utf-8") as f:
raw_config = yaml.safe_load(f) or {}
agent_cfg = raw_config.get("agent")
cfg_max_turns = (
agent_cfg.get("max_turns")
if isinstance(agent_cfg, dict)
else None
)
# Legacy root-level key counts too.
if cfg_max_turns is None:
cfg_max_turns = raw_config.get("max_turns")
env_ghost = load_env().get("HERMES_MAX_ITERATIONS")
drift = (
cfg_max_turns is not None
and env_ghost is not None
and str(cfg_max_turns).strip() != str(env_ghost).strip()
)
if drift:
check_warn(
f"HERMES_MAX_ITERATIONS={env_ghost} in .env shadows "
f"agent.max_turns={cfg_max_turns} in config.yaml",
"(stale ghost from an earlier `hermes setup` run)",
)
if should_fix:
if remove_env_value("HERMES_MAX_ITERATIONS"):
check_ok(
"Removed stale HERMES_MAX_ITERATIONS from .env "
f"(config.yaml agent.max_turns={cfg_max_turns} is now authoritative)"
)
fixed_count += 1
else:
check_warn("Could not remove HERMES_MAX_ITERATIONS from .env")
manual_issues.append(
"Manually delete the HERMES_MAX_ITERATIONS line from "
f"{_DHH}/.env — config.yaml agent.max_turns is authoritative."
)
else:
issues.append(
"Stale HERMES_MAX_ITERATIONS in .env shadows config.yaml — "
"run 'hermes doctor --fix'"
)
except Exception:
pass
# Validate config structure (catches malformed custom_providers, etc.)
try:
from hermes_cli.config import validate_config_structure

View File

@ -486,6 +486,18 @@ class TestOptionalEnvVarsRegistry:
all_vars.extend(vars_list)
assert "TAVILY_API_KEY" in all_vars
def test_max_iterations_not_offered_as_env_var(self):
"""HERMES_MAX_ITERATIONS must NOT be in OPTIONAL_ENV_VARS (issue #17534).
Offering it as an editable env var (dashboard, `hermes setup`) lets a
user write it to .env, recreating the stale ghost that shadows
config.yaml's agent.max_turns. The iteration budget is configured ONLY
via config.yaml; HERMES_MAX_ITERATIONS remains a read-only backward-compat
fallback in the gateway/CLI, never a promoted write target.
"""
from hermes_cli.config import OPTIONAL_ENV_VARS
assert "HERMES_MAX_ITERATIONS" not in OPTIONAL_ENV_VARS
class TestConfigMigrationSecretPrompts:
def test_required_secret_env_prompt_uses_masked_prompt(self, tmp_path, monkeypatch):

View File

@ -1274,3 +1274,87 @@ class TestDoctorCodexCliHintPlacement:
minimax_idx = next(i for i, l in enumerate(lines) if "MiniMax OAuth" in l)
assert self._hint_line() not in lines[minimax_idx - 1]
assert minimax_idx + 1 >= len(lines) or self._hint_line() not in lines[minimax_idx + 1]
class TestDoctorStaleMaxIterationsDrift:
"""Regression for #17534: a stale HERMES_MAX_ITERATIONS in .env shadows
agent.max_turns in config.yaml. The repro symptom is config.yaml saying
400 while the gateway activity line reads N/90. Doctor must detect the
drift, and `--fix` must remove the .env ghost (config.yaml wins).
The detector reads the .env FILE directly, NOT os.environ — the gateway
startup bridge can already have overridden os.environ to the config value,
so the ghost is only visible in the file.
"""
def _run_config_section(self, monkeypatch, tmp_path, *, fix, ghost, cfg_turns,
os_environ_value=None):
import pathlib
import contextlib
import io
from argparse import Namespace
hermes_home = tmp_path / ".hermes"
hermes_home.mkdir(parents=True)
(hermes_home / "config.yaml").write_text(
f"agent:\n max_turns: {cfg_turns}\n", encoding="utf-8"
)
env_lines = ["OPENAI_API_KEY=sk-test\n"]
if ghost is not None:
env_lines.append(f"HERMES_MAX_ITERATIONS={ghost}\n")
(hermes_home / ".env").write_text("".join(env_lines), encoding="utf-8")
monkeypatch.setattr(doctor_mod, "HERMES_HOME", hermes_home)
monkeypatch.setattr(doctor_mod, "get_hermes_home", lambda: hermes_home)
# Point the config helpers at the temp home.
monkeypatch.setenv("HERMES_HOME", str(hermes_home))
if os_environ_value is not None:
# Simulate the gateway bridge having already overridden os.environ.
monkeypatch.setenv("HERMES_MAX_ITERATIONS", str(os_environ_value))
else:
monkeypatch.delenv("HERMES_MAX_ITERATIONS", raising=False)
# Short-circuit at the Tool Availability stage — the drift check runs
# well before it in the Configuration Files section.
fake_model_tools = types.SimpleNamespace(
check_tool_availability=lambda *a, **kw: (_ for _ in ()).throw(SystemExit(0)),
TOOLSET_REQUIREMENTS={},
)
monkeypatch.setitem(sys.modules, "model_tools", fake_model_tools)
buf = io.StringIO()
with contextlib.redirect_stdout(buf), pytest.raises(SystemExit):
doctor_mod.run_doctor(Namespace(fix=fix))
return buf.getvalue(), hermes_home
def test_detects_drift_warn_only(self, monkeypatch, tmp_path):
out, hermes_home = self._run_config_section(
monkeypatch, tmp_path, fix=False, ghost=90, cfg_turns=400,
os_environ_value=400, # bridge contaminated os.environ
)
assert "HERMES_MAX_ITERATIONS=90" in out
assert "shadows" in out
# Warn-only must NOT mutate .env.
assert "HERMES_MAX_ITERATIONS=90" in (hermes_home / ".env").read_text(encoding="utf-8")
def test_fix_removes_ghost(self, monkeypatch, tmp_path):
out, hermes_home = self._run_config_section(
monkeypatch, tmp_path, fix=True, ghost=90, cfg_turns=400,
os_environ_value=400,
)
assert "Removed stale HERMES_MAX_ITERATIONS" in out
env_after = (hermes_home / ".env").read_text(encoding="utf-8")
assert "HERMES_MAX_ITERATIONS" not in env_after
assert "OPENAI_API_KEY=sk-test" in env_after # other keys preserved
def test_no_drift_when_values_match(self, monkeypatch, tmp_path):
out, _ = self._run_config_section(
monkeypatch, tmp_path, fix=False, ghost=400, cfg_turns=400,
)
assert "shadows" not in out
def test_no_drift_when_ghost_absent(self, monkeypatch, tmp_path):
out, _ = self._run_config_section(
monkeypatch, tmp_path, fix=False, ghost=None, cfg_turns=400,
)
assert "shadows" not in out