From a1cb5fa2c7cd5239a4909261888453d97086986d Mon Sep 17 00:00:00 2001
From: teknium1 <127238744+teknium1@users.noreply.github.com>
Date: Fri, 29 May 2026 12:17:24 -0700
Subject: [PATCH] fix(gateway): anchor service WorkingDirectory at HERMES_HOME,
not the source checkout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The systemd unit (and launchd plist) pinned WorkingDirectory to PROJECT_ROOT
(the checkout the unit was generated from). When that checkout is transient —
a git worktree, or a clone that `hermes update` later relocates or removes —
the path rots. systemd then fails the start at the CHDIR step
(status=200/CHDIR) BEFORE Python loads, so the on-boot
refresh_systemd_unit_if_needed() self-heal never runs and Restart=always
crash-loops forever on a dead directory. Observed in the wild: a gateway that
crash-looped 153 times overnight, leaving the bot offline until a manual
'hermes gateway restart' regenerated the unit.
Anchor cwd at HERMES_HOME instead — it never moves, always exists, and the
gateway never needed cwd to be the checkout (ExecStart runs an absolute python
interpreter with `-m hermes_cli.main`). Existing broken units now differ from
the generated unit and self-heal on the next start/restart/update.
---
hermes_cli/gateway.py | 40 ++++++++++++++++++++--
tests/hermes_cli/test_gateway_service.py | 43 ++++++++++++++++++++++++
2 files changed, 80 insertions(+), 3 deletions(-)
diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py
index e90f5f9cc..68eec04a1 100644
--- a/hermes_cli/gateway.py
+++ b/hermes_cli/gateway.py
@@ -2161,9 +2161,37 @@ def _build_service_path_dirs(project_root: Path | None = None) -> list[str]:
return candidates
+def _stable_service_working_dir() -> str:
+    """Choose a service ``WorkingDirectory`` that outlives any source checkout.
+
+    Why not ``PROJECT_ROOT``: a unit generated from a transient checkout (a
+    ``.worktrees/`` dir, or a clone that ``hermes update`` later moves or
+    deletes) ends up pointing at a dead path. systemd then aborts the start
+    at the CHDIR step (``status=200/CHDIR``) before Python is loaded, so
+    ``refresh_systemd_unit_if_needed()`` never gets to repair the unit and
+    ``Restart=always`` retries against the missing directory indefinitely.
+    The gateway itself does not depend on cwd: ``ExecStart`` runs an absolute
+    interpreter with ``-m hermes_cli.main``.
+
+    Returns:
+        The resolved ``HERMES_HOME`` path when it is an existing directory;
+        otherwise ``str(PROJECT_ROOT)``. The fallback also covers any
+        exception raised while looking up or resolving ``HERMES_HOME``.
+    """
+    fallback = str(PROJECT_ROOT)
+    try:
+        candidate = get_hermes_home()
+        if not candidate:
+            return fallback
+        candidate_dir = Path(candidate)
+        return str(candidate_dir.resolve()) if candidate_dir.is_dir() else fallback
+    except Exception:  # best-effort: any lookup failure uses the fallback
+        return fallback
+
+
def generate_systemd_unit(system: bool = False, run_as_user: str | None = None) -> str:
python_path = get_python_path()
- working_dir = str(PROJECT_ROOT)
+ working_dir = _stable_service_working_dir()
detected_venv = _detect_venv_dir()
venv_dir = str(detected_venv) if detected_venv else str(PROJECT_ROOT / "venv")
@@ -2192,7 +2220,10 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
# (e.g. /root/) to the target user's home so the service can
# actually access them.
python_path = _remap_path_for_user(python_path, home_dir)
- working_dir = _remap_path_for_user(working_dir, home_dir)
+ # Anchor cwd to the target user's HERMES_HOME (stable, always exists)
+ # rather than a remapped source-checkout path that can rot. See
+ # _stable_service_working_dir() for the full rationale.
+ working_dir = str(hermes_home) if hermes_home else _remap_path_for_user(working_dir, home_dir)
venv_dir = _remap_path_for_user(venv_dir, home_dir)
path_entries = [_remap_path_for_user(p, home_dir) for p in path_entries]
path_entries.extend(_build_user_local_paths(Path(home_dir), path_entries))
@@ -2804,7 +2835,10 @@ def _launchd_domain() -> str:
def generate_launchd_plist() -> str:
python_path = get_python_path()
- working_dir = str(PROJECT_ROOT)
+ # Stable cwd anchor — never the volatile source checkout. See
+ # _stable_service_working_dir() for the rationale (same rot risk applies
+ # to launchd's WorkingDirectory as to systemd's).
+ working_dir = _stable_service_working_dir()
hermes_home = str(get_hermes_home().resolve())
log_dir = get_hermes_home() / "logs"
log_dir.mkdir(parents=True, exist_ok=True)
diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py
index a8f65123d..4259e87d9 100644
--- a/tests/hermes_cli/test_gateway_service.py
+++ b/tests/hermes_cli/test_gateway_service.py
@@ -2531,3 +2531,46 @@ class TestGatewayCommandCatchesSystemScopeError:
# Renders the message, NOT the ``('msg', 'action')`` tuple repr
assert "System gateway start requires root. Re-run with sudo." in out
assert "('" not in out # no tuple repr leaking through
+
+
+class TestServiceWorkingDirIsStable:
+    """The gateway service must anchor WorkingDirectory at a stable path
+    (HERMES_HOME), never the source checkout / worktree, so a relocated or
+    deleted checkout can't crash-loop the unit on CHDIR (status=200).
+    """
+
+    def test_stable_working_dir_uses_hermes_home(self, tmp_path, monkeypatch):
+        home = tmp_path / ".hermes"
+        home.mkdir()
+        monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home)
+        assert Path(gateway_cli._stable_service_working_dir()) == home.resolve()
+
+    def test_stable_working_dir_falls_back_to_project_root(self, tmp_path, monkeypatch):
+        # HERMES_HOME points somewhere that does not exist -> fall back.
+        missing = tmp_path / "does-not-exist" / ".hermes"
+        monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: missing)
+        assert gateway_cli._stable_service_working_dir() == str(gateway_cli.PROJECT_ROOT)
+
+    def test_user_unit_workingdirectory_is_hermes_home_not_checkout(self, tmp_path, monkeypatch):
+        home = tmp_path / ".hermes"
+        home.mkdir()
+        monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home)
+        unit = gateway_cli.generate_systemd_unit(system=False)
+        wd_lines = [line for line in unit.splitlines() if line.startswith("WorkingDirectory=")]
+        assert wd_lines, "unit has no WorkingDirectory line"
+        value = wd_lines[0].split("=", 1)[1]
+        assert Path(value).resolve() == home.resolve()
+        # The bug class: never pin cwd inside a transient worktree checkout.
+        assert "/.worktrees/" not in value
+
+    def test_launchd_workingdirectory_is_hermes_home(self, tmp_path, monkeypatch):
+        import re
+        home = tmp_path / ".hermes"
+        home.mkdir()
+        monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home)
+        plist = gateway_cli.generate_launchd_plist()
+        # Anchor on the <key>/<string> pair; a bare trailing (.*?) captures "".
+        m = re.search(r"<key>WorkingDirectory</key>\s*<string>(.*?)</string>", plist)
+        assert m, "plist has no WorkingDirectory entry"
+        assert Path(m.group(1)).resolve() == home.resolve()
+        assert "/.worktrees/" not in m.group(1)