fix(gateway): anchor service WorkingDirectory at HERMES_HOME, not the source checkout
The systemd unit (and launchd plist) pinned WorkingDirectory to PROJECT_ROOT (the checkout the unit was generated from). When that checkout is transient — a git worktree, or a clone hermes update later relocates/removes — the path rots. systemd then fails the start at the CHDIR step (status=200/CHDIR) BEFORE Python loads, so the on-boot refresh_systemd_unit_if_needed() self-heal never runs and Restart=always crash-loops forever on a dead directory. Observed in the wild: a gateway that crash-looped 153 times overnight, bot offline until a manual 'hermes gateway restart' regenerated the unit. Anchor cwd at HERMES_HOME instead — it never moves, always exists, and the gateway never needed cwd to be the checkout (ExecStart uses an absolute python + -m hermes_cli.main). Existing broken units now differ from the generated unit and self-heal on the next start/restart/update.
This commit is contained in:
@ -2161,9 +2161,37 @@ def _build_service_path_dirs(project_root: Path | None = None) -> list[str]:
|
||||
return candidates
|
||||
|
||||
|
||||
def _stable_service_working_dir() -> str:
|
||||
"""Return a WorkingDirectory that will not disappear out from under systemd.
|
||||
|
||||
The gateway does NOT need its cwd to be the source checkout — ``ExecStart``
|
||||
uses an absolute python interpreter and ``-m hermes_cli.main``, so module
|
||||
resolution does not depend on cwd. Pinning ``WorkingDirectory`` to
|
||||
``PROJECT_ROOT`` (``Path(__file__).parent.parent``) is actively harmful:
|
||||
when the unit is generated from a transient checkout — a ``.worktrees/``
|
||||
dir, or a clone that ``hermes update`` later relocates/removes — the path
|
||||
rots. systemd then fails the start at the CHDIR step (``status=200/CHDIR``,
|
||||
"Changing to the requested working directory failed") *before* Python
|
||||
loads, so the on-boot ``refresh_systemd_unit_if_needed()`` self-heal never
|
||||
runs and ``Restart=always`` crash-loops forever on a dead directory.
|
||||
|
||||
``HERMES_HOME`` is the stable anchor: it is where config/state/logs live,
|
||||
it never moves, and it is guaranteed to exist whenever the gateway is
|
||||
meaningfully installed. Fall back to ``PROJECT_ROOT`` only if HERMES_HOME
|
||||
cannot be resolved (it always can in practice).
|
||||
"""
|
||||
try:
|
||||
home = get_hermes_home()
|
||||
if home and Path(home).is_dir():
|
||||
return str(Path(home).resolve())
|
||||
except Exception:
|
||||
pass
|
||||
return str(PROJECT_ROOT)
|
||||
|
||||
|
||||
def generate_systemd_unit(system: bool = False, run_as_user: str | None = None) -> str:
|
||||
python_path = get_python_path()
|
||||
working_dir = str(PROJECT_ROOT)
|
||||
working_dir = _stable_service_working_dir()
|
||||
detected_venv = _detect_venv_dir()
|
||||
venv_dir = str(detected_venv) if detected_venv else str(PROJECT_ROOT / "venv")
|
||||
|
||||
@ -2192,7 +2220,10 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
|
||||
# (e.g. /root/) to the target user's home so the service can
|
||||
# actually access them.
|
||||
python_path = _remap_path_for_user(python_path, home_dir)
|
||||
working_dir = _remap_path_for_user(working_dir, home_dir)
|
||||
# Anchor cwd to the target user's HERMES_HOME (stable, always exists)
|
||||
# rather than a remapped source-checkout path that can rot. See
|
||||
# _stable_service_working_dir() for the full rationale.
|
||||
working_dir = str(hermes_home) if hermes_home else _remap_path_for_user(working_dir, home_dir)
|
||||
venv_dir = _remap_path_for_user(venv_dir, home_dir)
|
||||
path_entries = [_remap_path_for_user(p, home_dir) for p in path_entries]
|
||||
path_entries.extend(_build_user_local_paths(Path(home_dir), path_entries))
|
||||
@ -2804,7 +2835,10 @@ def _launchd_domain() -> str:
|
||||
|
||||
def generate_launchd_plist() -> str:
|
||||
python_path = get_python_path()
|
||||
working_dir = str(PROJECT_ROOT)
|
||||
# Stable cwd anchor — never the volatile source checkout. See
|
||||
# _stable_service_working_dir() for the rationale (same rot risk applies
|
||||
# to launchd's WorkingDirectory as to systemd's).
|
||||
working_dir = _stable_service_working_dir()
|
||||
hermes_home = str(get_hermes_home().resolve())
|
||||
log_dir = get_hermes_home() / "logs"
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@ -2531,3 +2531,46 @@ class TestGatewayCommandCatchesSystemScopeError:
|
||||
# Renders the message, NOT the ``('msg', 'action')`` tuple repr
|
||||
assert "System gateway start requires root. Re-run with sudo." in out
|
||||
assert "('" not in out # no tuple repr leaking through
|
||||
|
||||
|
||||
class TestServiceWorkingDirIsStable:
|
||||
"""The gateway service must anchor WorkingDirectory at a stable path
|
||||
(HERMES_HOME), never the source checkout / worktree, so a relocated or
|
||||
deleted checkout can't crash-loop the unit on CHDIR (status=200).
|
||||
"""
|
||||
|
||||
def test_stable_working_dir_uses_hermes_home(self, tmp_path, monkeypatch):
|
||||
home = tmp_path / ".hermes"
|
||||
home.mkdir()
|
||||
monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home)
|
||||
assert Path(gateway_cli._stable_service_working_dir()) == home.resolve()
|
||||
|
||||
def test_stable_working_dir_falls_back_to_project_root(self, tmp_path, monkeypatch):
|
||||
# HERMES_HOME points somewhere that does not exist -> fall back.
|
||||
missing = tmp_path / "does-not-exist" / ".hermes"
|
||||
monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: missing)
|
||||
assert gateway_cli._stable_service_working_dir() == str(gateway_cli.PROJECT_ROOT)
|
||||
|
||||
def test_user_unit_workingdirectory_is_hermes_home_not_checkout(self, tmp_path, monkeypatch):
|
||||
home = tmp_path / ".hermes"
|
||||
home.mkdir()
|
||||
monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home)
|
||||
unit = gateway_cli.generate_systemd_unit(system=False)
|
||||
wd = [l for l in unit.splitlines() if l.startswith("WorkingDirectory=")]
|
||||
assert wd, "unit has no WorkingDirectory line"
|
||||
value = wd[0].split("=", 1)[1]
|
||||
assert Path(value).resolve() == home.resolve()
|
||||
# The bug class: never pin cwd inside a transient worktree checkout.
|
||||
assert "/.worktrees/" not in value
|
||||
|
||||
def test_launchd_workingdirectory_is_hermes_home(self, tmp_path, monkeypatch):
|
||||
import re
|
||||
|
||||
home = tmp_path / ".hermes"
|
||||
home.mkdir()
|
||||
monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home)
|
||||
plist = gateway_cli.generate_launchd_plist()
|
||||
m = re.search(r"<key>WorkingDirectory</key>\s*<string>(.*?)</string>", plist)
|
||||
assert m, "plist has no WorkingDirectory entry"
|
||||
assert Path(m.group(1)).resolve() == home.resolve()
|
||||
assert "/.worktrees/" not in m.group(1)
|
||||
|
||||
Reference in New Issue
Block a user