From a1cb5fa2c7cd5239a4909261888453d97086986d Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 12:17:24 -0700 Subject: [PATCH] fix(gateway): anchor service WorkingDirectory at HERMES_HOME, not the source checkout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The systemd unit (and launchd plist) pinned WorkingDirectory to PROJECT_ROOT (the checkout the unit was generated from). When that checkout is transient — a git worktree, or a clone hermes update later relocates/removes — the path rots. systemd then fails the start at the CHDIR step (status=200/CHDIR) BEFORE Python loads, so the on-boot refresh_systemd_unit_if_needed() self-heal never runs and Restart=always crash-loops forever on a dead directory. Observed in the wild: a gateway that crash-looped 153 times overnight, bot offline until a manual 'hermes gateway restart' regenerated the unit. Anchor cwd at HERMES_HOME instead — it never moves, always exists, and the gateway never needed cwd to be the checkout (ExecStart uses an absolute python + -m hermes_cli.main). Existing broken units now differ from the generated unit and self-heal on the next start/restart/update. --- hermes_cli/gateway.py | 40 ++++++++++++++++++++-- tests/hermes_cli/test_gateway_service.py | 43 ++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index e90f5f9cc..68eec04a1 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -2161,9 +2161,37 @@ def _build_service_path_dirs(project_root: Path | None = None) -> list[str]: return candidates +def _stable_service_working_dir() -> str: + """Return a WorkingDirectory that will not disappear out from under systemd. + + The gateway does NOT need its cwd to be the source checkout — ``ExecStart`` + uses an absolute python interpreter and ``-m hermes_cli.main``, so module + resolution does not depend on cwd. Pinning ``WorkingDirectory`` to + ``PROJECT_ROOT`` (``Path(__file__).parent.parent``) is actively harmful: + when the unit is generated from a transient checkout — a ``.worktrees/`` + dir, or a clone that ``hermes update`` later relocates/removes — the path + rots. systemd then fails the start at the CHDIR step (``status=200/CHDIR``, + "Changing to the requested working directory failed") *before* Python + loads, so the on-boot ``refresh_systemd_unit_if_needed()`` self-heal never + runs and ``Restart=always`` crash-loops forever on a dead directory. + + ``HERMES_HOME`` is the stable anchor: it is where config/state/logs live, + it never moves, and it is guaranteed to exist whenever the gateway is + meaningfully installed. Fall back to ``PROJECT_ROOT`` only if HERMES_HOME + cannot be resolved (it always can in practice). + """ + try: + home = get_hermes_home() + if home and Path(home).is_dir(): + return str(Path(home).resolve()) + except Exception: + pass + return str(PROJECT_ROOT) + + def generate_systemd_unit(system: bool = False, run_as_user: str | None = None) -> str: python_path = get_python_path() - working_dir = str(PROJECT_ROOT) + working_dir = _stable_service_working_dir() detected_venv = _detect_venv_dir() venv_dir = str(detected_venv) if detected_venv else str(PROJECT_ROOT / "venv") @@ -2192,7 +2220,10 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None) # (e.g. /root/) to the target user's home so the service can # actually access them. python_path = _remap_path_for_user(python_path, home_dir) - working_dir = _remap_path_for_user(working_dir, home_dir) + # Anchor cwd to the target user's HERMES_HOME (stable, always exists) + # rather than a remapped source-checkout path that can rot. See + # _stable_service_working_dir() for the full rationale. + working_dir = str(hermes_home) if hermes_home else _remap_path_for_user(working_dir, home_dir) venv_dir = _remap_path_for_user(venv_dir, home_dir) path_entries = [_remap_path_for_user(p, home_dir) for p in path_entries] path_entries.extend(_build_user_local_paths(Path(home_dir), path_entries)) @@ -2804,7 +2835,10 @@ def _launchd_domain() -> str: def generate_launchd_plist() -> str: python_path = get_python_path() - working_dir = str(PROJECT_ROOT) + # Stable cwd anchor — never the volatile source checkout. See + # _stable_service_working_dir() for the rationale (same rot risk applies + # to launchd's WorkingDirectory as to systemd's). + working_dir = _stable_service_working_dir() hermes_home = str(get_hermes_home().resolve()) log_dir = get_hermes_home() / "logs" log_dir.mkdir(parents=True, exist_ok=True) diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index a8f65123d..4259e87d9 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -2531,3 +2531,46 @@ class TestGatewayCommandCatchesSystemScopeError: # Renders the message, NOT the ``('msg', 'action')`` tuple repr assert "System gateway start requires root. Re-run with sudo." in out assert "('" not in out # no tuple repr leaking through + + +class TestServiceWorkingDirIsStable: + """The gateway service must anchor WorkingDirectory at a stable path + (HERMES_HOME), never the source checkout / worktree, so a relocated or + deleted checkout can't crash-loop the unit on CHDIR (status=200). + """ + + def test_stable_working_dir_uses_hermes_home(self, tmp_path, monkeypatch): + home = tmp_path / ".hermes" + home.mkdir() + monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home) + assert Path(gateway_cli._stable_service_working_dir()) == home.resolve() + + def test_stable_working_dir_falls_back_to_project_root(self, tmp_path, monkeypatch): + # HERMES_HOME points somewhere that does not exist -> fall back. + missing = tmp_path / "does-not-exist" / ".hermes" + monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: missing) + assert gateway_cli._stable_service_working_dir() == str(gateway_cli.PROJECT_ROOT) + + def test_user_unit_workingdirectory_is_hermes_home_not_checkout(self, tmp_path, monkeypatch): + home = tmp_path / ".hermes" + home.mkdir() + monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home) + unit = gateway_cli.generate_systemd_unit(system=False) + wd = [l for l in unit.splitlines() if l.startswith("WorkingDirectory=")] + assert wd, "unit has no WorkingDirectory line" + value = wd[0].split("=", 1)[1] + assert Path(value).resolve() == home.resolve() + # The bug class: never pin cwd inside a transient worktree checkout. + assert "/.worktrees/" not in value + + def test_launchd_workingdirectory_is_hermes_home(self, tmp_path, monkeypatch): + import re + + home = tmp_path / ".hermes" + home.mkdir() + monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home) + plist = gateway_cli.generate_launchd_plist() + m = re.search(r"WorkingDirectory\s*(.*?)", plist) + assert m, "plist has no WorkingDirectory entry" + assert Path(m.group(1)).resolve() == home.resolve() + assert "/.worktrees/" not in m.group(1)