fix(cli): launchd KeepAlive unconditional restart (#37388)

Replace KeepAlive.SuccessfulExit=false dict with <key>KeepAlive</key><true/>
so launchd restarts hermes-gateway on any exit, matching the documented
drain-then-exit restart protocol used by --graceful-restart.
This commit is contained in:
ashishpatel26
2026-06-04 12:22:06 +05:30
committed by Teknium
parent 153fe28474
commit c9b62061d4
2 changed files with 25 additions and 8 deletions

View File

@ -227,9 +227,8 @@ def _graceful_restart_via_sigusr1(pid: int, drain_timeout: float) -> bool:
SIGUSR1 is wired in gateway/run.py to ``request_restart(via_service=True)``
which drains in-flight agent runs (up to ``agent.restart_drain_timeout``
seconds), then exits. systemd relaunches clean exits via
``Restart=always``; launchd still uses a non-zero planned-restart exit
because its plist has ``KeepAlive.SuccessfulExit = false``.
seconds), then exits. Both systemd (``Restart=always``) and launchd
(unconditional ``<key>KeepAlive</key><true/>``) restart on any exit.
This is the drain-aware alternative to ``systemctl restart`` / ``SIGTERM``,
which SIGKILL in-flight agents after a short timeout.
@ -3083,10 +3082,7 @@ def generate_launchd_plist() -> str:
<true/>
<key>KeepAlive</key>
<dict>
<key>SuccessfulExit</key>
<false/>
</dict>
<true/>
<key>StandardOutPath</key>
<string>{log_dir}/gateway.log</string>
@ -3249,7 +3245,7 @@ def launchd_stop():
pass
# bootout unloads the service definition so KeepAlive doesn't respawn
# the process. A plain `kill SIGTERM` only signals the process — launchd
# immediately restarts it because KeepAlive.SuccessfulExit = false.
# immediately restarts it because KeepAlive is unconditionally true.
# `hermes gateway start` re-bootstraps when it detects the job is unloaded.
try:
subprocess.run(["launchctl", "bootout", target], check=True, timeout=90)

View File

@ -2579,3 +2579,24 @@ class TestServiceWorkingDirIsStable:
assert m, "plist has no WorkingDirectory entry"
assert Path(m.group(1)).resolve() == home.resolve()
assert "/.worktrees/" not in m.group(1)
def test_launchd_plist_keepalive_unconditional(self, tmp_path, monkeypatch):
"""KeepAlive must be unconditional <true/> so the gateway restarts on clean exits.
Bug #37388: the old ``KeepAlive.SuccessfulExit = false`` dict form meant
launchd would NOT restart after a zero-exit (e.g. ``gateway run --replace``
causes the old instance to exit cleanly). Switching to the scalar
``<key>KeepAlive</key><true/>`` makes launchd restart regardless of exit code.
"""
home = tmp_path / ".hermes"
home.mkdir()
monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home)
plist = gateway_cli.generate_launchd_plist()
# Scalar <true/> must be present immediately after the KeepAlive key
assert "<key>KeepAlive</key>" in plist
# The unconditional form
assert "<key>KeepAlive</key>\n <true/>" in plist
# The old conditional dict form must NOT appear
assert "SuccessfulExit" not in plist
assert "<key>KeepAlive</key>\n <dict>" not in plist