diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index f477019cb..05dc69e23 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -227,9 +227,8 @@ def _graceful_restart_via_sigusr1(pid: int, drain_timeout: float) -> bool: SIGUSR1 is wired in gateway/run.py to ``request_restart(via_service=True)`` which drains in-flight agent runs (up to ``agent.restart_drain_timeout`` - seconds), then exits. systemd relaunches clean exits via - ``Restart=always``; launchd still uses a non-zero planned-restart exit - because its plist has ``KeepAlive.SuccessfulExit = false``. + seconds), then exits. Both systemd (``Restart=always``) and launchd + (unconditional ``KeepAlive``) restart on any exit. This is the drain-aware alternative to ``systemctl restart`` / ``SIGTERM``, which SIGKILL in-flight agents after a short timeout. @@ -3083,10 +3082,7 @@ def generate_launchd_plist() -> str: KeepAlive - - SuccessfulExit - - + StandardOutPath {log_dir}/gateway.log @@ -3249,7 +3245,7 @@ def launchd_stop(): pass # bootout unloads the service definition so KeepAlive doesn't respawn # the process. A plain `kill SIGTERM` only signals the process — launchd - # immediately restarts it because KeepAlive.SuccessfulExit = false. + # immediately restarts it because KeepAlive is unconditionally true. # `hermes gateway start` re-bootstraps when it detects the job is unloaded. try: subprocess.run(["launchctl", "bootout", target], check=True, timeout=90) diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index c6baa7156..c529fa59a 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -2579,3 +2579,24 @@ class TestServiceWorkingDirIsStable: assert m, "plist has no WorkingDirectory entry" assert Path(m.group(1)).resolve() == home.resolve() assert "/.worktrees/" not in m.group(1) + + def test_launchd_plist_keepalive_unconditional(self, tmp_path, monkeypatch): + """KeepAlive must be unconditional so the gateway restarts on clean exits. + + Bug #37388: the old ``KeepAlive.SuccessfulExit = false`` dict form meant + launchd would NOT restart after a zero-exit (e.g. ``gateway run --replace`` + causes the old instance to exit cleanly). Switching to the scalar + ``KeepAlive`` makes launchd restart regardless of exit code. + """ + home = tmp_path / ".hermes" + home.mkdir() + monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: home) + plist = gateway_cli.generate_launchd_plist() + + # Scalar must be present immediately after the KeepAlive key + assert "KeepAlive" in plist + # The unconditional form + assert "KeepAlive\n " in plist + # The old conditional dict form must NOT appear + assert "SuccessfulExit" not in plist + assert "KeepAlive\n " not in plist