fix(gateway): anchor service WorkingDirectory at HERMES_HOME, not the source checkout

The systemd unit (and launchd plist) pinned WorkingDirectory to PROJECT_ROOT
(the checkout the unit was generated from). When that checkout is transient —
a git worktree, or a clone hermes update later relocates/removes — the path
rots. systemd then fails the start at the CHDIR step (status=200/CHDIR) BEFORE
Python loads, so the on-boot refresh_systemd_unit_if_needed() self-heal never
runs and Restart=always crash-loops forever on a dead directory. Observed in
the wild: a gateway that crash-looped 153 times overnight, bot offline until a
manual 'hermes gateway restart' regenerated the unit.

Anchor cwd at HERMES_HOME instead — it never moves, always exists, and the
gateway never needed cwd to be the checkout (ExecStart uses an absolute python
+ -m hermes_cli.main). Existing broken units now differ from the generated unit
and self-heal on the next start/restart/update.
This commit is contained in:
teknium1
2026-05-29 12:17:24 -07:00
committed by Teknium
parent 45b00bb49a
commit a1cb5fa2c7
2 changed files with 80 additions and 3 deletions

View File

@ -2161,9 +2161,37 @@ def _build_service_path_dirs(project_root: Path | None = None) -> list[str]:
return candidates
def _stable_service_working_dir() -> str:
"""Return a WorkingDirectory that will not disappear out from under systemd.
The gateway does NOT need its cwd to be the source checkout — ``ExecStart``
uses an absolute python interpreter and ``-m hermes_cli.main``, so module
resolution does not depend on cwd. Pinning ``WorkingDirectory`` to
``PROJECT_ROOT`` (``Path(__file__).parent.parent``) is actively harmful:
when the unit is generated from a transient checkout — a ``.worktrees/``
dir, or a clone that ``hermes update`` later relocates/removes — the path
rots. systemd then fails the start at the CHDIR step (``status=200/CHDIR``,
"Changing to the requested working directory failed") *before* Python
loads, so the on-boot ``refresh_systemd_unit_if_needed()`` self-heal never
runs and ``Restart=always`` crash-loops forever on a dead directory.
``HERMES_HOME`` is the stable anchor: it is where config/state/logs live,
it never moves, and it is guaranteed to exist whenever the gateway is
meaningfully installed. Fall back to ``PROJECT_ROOT`` only if HERMES_HOME
cannot be resolved (it always can in practice).
"""
try:
home = get_hermes_home()
if home and Path(home).is_dir():
return str(Path(home).resolve())
except Exception:
pass
return str(PROJECT_ROOT)
def generate_systemd_unit(system: bool = False, run_as_user: str | None = None) -> str:
python_path = get_python_path()
working_dir = str(PROJECT_ROOT)
working_dir = _stable_service_working_dir()
detected_venv = _detect_venv_dir()
venv_dir = str(detected_venv) if detected_venv else str(PROJECT_ROOT / "venv")
@ -2192,7 +2220,10 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
# (e.g. /root/) to the target user's home so the service can
# actually access them.
python_path = _remap_path_for_user(python_path, home_dir)
working_dir = _remap_path_for_user(working_dir, home_dir)
# Anchor cwd to the target user's HERMES_HOME (stable, always exists)
# rather than a remapped source-checkout path that can rot. See
# _stable_service_working_dir() for the full rationale.
working_dir = str(hermes_home) if hermes_home else _remap_path_for_user(working_dir, home_dir)
venv_dir = _remap_path_for_user(venv_dir, home_dir)
path_entries = [_remap_path_for_user(p, home_dir) for p in path_entries]
path_entries.extend(_build_user_local_paths(Path(home_dir), path_entries))
@ -2804,7 +2835,10 @@ def _launchd_domain() -> str:
def generate_launchd_plist() -> str:
python_path = get_python_path()
working_dir = str(PROJECT_ROOT)
# Stable cwd anchor — never the volatile source checkout. See
# _stable_service_working_dir() for the rationale (same rot risk applies
# to launchd's WorkingDirectory as to systemd's).
working_dir = _stable_service_working_dir()
hermes_home = str(get_hermes_home().resolve())
log_dir = get_hermes_home() / "logs"
log_dir.mkdir(parents=True, exist_ok=True)

View File

@ -2531,3 +2531,46 @@ class TestGatewayCommandCatchesSystemScopeError:
# Renders the message, NOT the ``('msg', 'action')`` tuple repr
assert "System gateway start requires root. Re-run with sudo." in out
assert "('" not in out # no tuple repr leaking through
class TestServiceWorkingDirIsStable:
"""The gateway service must anchor WorkingDirectory at a stable path
(HERMES_HOME), never the source checkout / worktree, so a relocated or
deleted checkout can't crash-loop the unit on CHDIR (status=200).
"""
def test_stable_working_dir_uses_hermes_home(self, tmp_path, monkeypatch):
home = tmp_path / ".hermes"
home.mkdir()
monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home)
assert Path(gateway_cli._stable_service_working_dir()) == home.resolve()
def test_stable_working_dir_falls_back_to_project_root(self, tmp_path, monkeypatch):
# HERMES_HOME points somewhere that does not exist -> fall back.
missing = tmp_path / "does-not-exist" / ".hermes"
monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: missing)
assert gateway_cli._stable_service_working_dir() == str(gateway_cli.PROJECT_ROOT)
def test_user_unit_workingdirectory_is_hermes_home_not_checkout(self, tmp_path, monkeypatch):
home = tmp_path / ".hermes"
home.mkdir()
monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home)
unit = gateway_cli.generate_systemd_unit(system=False)
wd = [l for l in unit.splitlines() if l.startswith("WorkingDirectory=")]
assert wd, "unit has no WorkingDirectory line"
value = wd[0].split("=", 1)[1]
assert Path(value).resolve() == home.resolve()
# The bug class: never pin cwd inside a transient worktree checkout.
assert "/.worktrees/" not in value
def test_launchd_workingdirectory_is_hermes_home(self, tmp_path, monkeypatch):
import re
home = tmp_path / ".hermes"
home.mkdir()
monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home)
plist = gateway_cli.generate_launchd_plist()
m = re.search(r"<key>WorkingDirectory</key>\s*<string>(.*?)</string>", plist)
assert m, "plist has no WorkingDirectory entry"
assert Path(m.group(1)).resolve() == home.resolve()
assert "/.worktrees/" not in m.group(1)