fix(gateway): use setsid instead of systemd-run --user for /update (salvage #4024) (#4104)

Salvaged from PR #4024 by @Sertug17. Fixes #4017.

- Replace systemd-run --user --scope with setsid for portable session detach
- Add system-level service detection to cmd_update gateway restart
- Falls back to start_new_session=True on systems without setsid (macOS, minimal containers)
This commit is contained in:
Teknium
2026-03-30 20:22:09 -07:00
committed by GitHub
parent fb2af3bd1d
commit 83e5249be6
4 changed files with 161 additions and 30 deletions

View File

@ -4617,8 +4617,8 @@ class GatewayRunner:
async def _handle_update_command(self, event: MessageEvent) -> str:
"""Handle /update command — update Hermes Agent to the latest version.
Spawns ``hermes update`` in a separate systemd scope so it survives the
gateway restart that ``hermes update`` may trigger at the end. Marker
Spawns ``hermes update`` in a detached session (via ``setsid``) so it
survives the gateway restart that ``hermes update`` may trigger. Marker
files are written so either the current gateway process or the next one
can notify the user when the update finishes.
"""
@ -4658,28 +4658,28 @@ class GatewayRunner:
pending_path.write_text(json.dumps(pending))
exit_code_path.unlink(missing_ok=True)
# Spawn `hermes update` in a separate cgroup so it survives gateway
# restart. systemd-run --user --scope creates a transient scope unit.
# Spawn `hermes update` detached so it survives gateway restart.
# Use setsid for portable session detach (works under system services
# where systemd-run --user fails due to missing D-Bus session).
hermes_cmd_str = " ".join(shlex.quote(part) for part in hermes_cmd)
update_cmd = (
f"{hermes_cmd_str} update > {shlex.quote(str(output_path))} 2>&1; "
f"status=$?; printf '%s' \"$status\" > {shlex.quote(str(exit_code_path))}"
)
try:
systemd_run = shutil.which("systemd-run")
if systemd_run:
setsid_bin = shutil.which("setsid")
if setsid_bin:
# Preferred: setsid creates a new session, fully detached
subprocess.Popen(
[systemd_run, "--user", "--scope",
"--unit=hermes-update", "--",
"bash", "-c", update_cmd],
[setsid_bin, "bash", "-c", update_cmd],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True,
)
else:
# Fallback: best-effort detach with start_new_session
# Fallback: start_new_session=True calls os.setsid() in child
subprocess.Popen(
["bash", "-c", f"nohup {update_cmd} &"],
["bash", "-c", update_cmd],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True,

View File

@ -3165,6 +3165,7 @@ def cmd_update(args):
_gw_service_name = get_service_name()
existing_pid = get_running_pid()
has_systemd_service = False
has_system_service = False
has_launchd_service = False
try:
@ -3177,6 +3178,19 @@ def cmd_update(args):
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
# Also check for a system-level service (hermes gateway install --system).
# This covers gateways running under system systemd where --user
# fails due to missing D-Bus session.
if not has_systemd_service and is_linux():
try:
check = subprocess.run(
["systemctl", "is-active", _gw_service_name],
capture_output=True, text=True, timeout=5,
)
has_system_service = check.stdout.strip() == "active"
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
# Check for macOS launchd service
if is_macos():
try:
@ -3191,7 +3205,7 @@ def cmd_update(args):
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
if existing_pid or has_systemd_service or has_launchd_service:
if existing_pid or has_systemd_service or has_system_service or has_launchd_service:
print()
# When a service manager is handling the gateway, let it
@ -3232,6 +3246,21 @@ def cmd_update(args):
print(" hermes gateway restart")
else:
print(" Try manually: hermes gateway restart")
elif has_system_service:
# System-level service (hermes gateway install --system).
# No D-Bus session needed — systemctl without --user talks
# directly to the system manager over /run/systemd/private.
print("→ Restarting system gateway service...")
restart = subprocess.run(
["systemctl", "restart", _gw_service_name],
capture_output=True, text=True, timeout=15,
)
if restart.returncode == 0:
print("✓ Gateway restarted (system service).")
else:
print(f"⚠ Gateway restart failed: {restart.stderr.strip()}")
print(" System services may require root. Try:")
print(f" sudo systemctl restart {_gw_service_name}")
elif has_launchd_service:
# Refresh the plist first (picks up --replace and other
# changes from the update we just pulled).

View File

@ -202,7 +202,7 @@ class TestHandleUpdateCommand:
with patch("gateway.run._hermes_home", hermes_home), \
patch("gateway.run.__file__", fake_file), \
patch("shutil.which", side_effect=lambda x: "/usr/bin/hermes" if x == "hermes" else "/usr/bin/systemd-run"), \
patch("shutil.which", side_effect=lambda x: "/usr/bin/hermes" if x == "hermes" else "/usr/bin/setsid"), \
patch("subprocess.Popen"):
result = await runner._handle_update_command(event)
@ -215,8 +215,8 @@ class TestHandleUpdateCommand:
assert not (hermes_home / ".update_exit_code").exists()
@pytest.mark.asyncio
async def test_spawns_systemd_run(self, tmp_path):
"""Uses systemd-run when available."""
async def test_spawns_setsid(self, tmp_path):
"""Uses setsid when available."""
runner = _make_runner()
event = _make_event()
@ -236,16 +236,16 @@ class TestHandleUpdateCommand:
patch("subprocess.Popen", mock_popen):
result = await runner._handle_update_command(event)
# Verify systemd-run was used
# Verify setsid was used
call_args = mock_popen.call_args[0][0]
assert call_args[0] == "/usr/bin/systemd-run"
assert "--scope" in call_args
assert call_args[0] == "/usr/bin/setsid"
assert call_args[1] == "bash"
assert ".update_exit_code" in call_args[-1]
assert "Starting Hermes update" in result
@pytest.mark.asyncio
async def test_fallback_nohup_when_no_systemd_run(self, tmp_path):
"""Falls back to nohup when systemd-run is not available."""
async def test_fallback_when_no_setsid(self, tmp_path):
"""Falls back to start_new_session=True when setsid is not available."""
runner = _make_runner()
event = _make_event()
@ -260,24 +260,27 @@ class TestHandleUpdateCommand:
mock_popen = MagicMock()
def which_no_systemd(x):
def which_no_setsid(x):
if x == "hermes":
return "/usr/bin/hermes"
if x == "systemd-run":
if x == "setsid":
return None
return None
with patch("gateway.run._hermes_home", hermes_home), \
patch("gateway.run.__file__", fake_file), \
patch("shutil.which", side_effect=which_no_systemd), \
patch("shutil.which", side_effect=which_no_setsid), \
patch("subprocess.Popen", mock_popen):
result = await runner._handle_update_command(event)
# Verify bash -c nohup fallback was used
# Verify plain bash -c fallback (no nohup, no setsid)
call_args = mock_popen.call_args[0][0]
assert call_args[0] == "bash"
assert "nohup" in call_args[2]
assert "nohup" not in call_args[2]
assert ".update_exit_code" in call_args[2]
# start_new_session=True should be in kwargs
call_kwargs = mock_popen.call_args[1]
assert call_kwargs.get("start_new_session") is True
assert "Starting Hermes update" in result
@pytest.mark.asyncio

View File

@ -25,6 +25,8 @@ def _make_run_side_effect(
verify_ok=True,
commit_count="3",
systemd_active=False,
system_service_active=False,
system_restart_rc=0,
launchctl_loaded=False,
):
"""Build a subprocess.run side_effect that simulates git + service commands."""
@ -45,14 +47,23 @@ def _make_run_side_effect(
if "rev-list" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout=f"{commit_count}\n", stderr="")
# systemctl --user is-active
# systemctl is-active — distinguish --user from system scope
if "systemctl" in joined and "is-active" in joined:
if systemd_active:
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
return subprocess.CompletedProcess(cmd, 3, stdout="inactive\n", stderr="")
if "--user" in joined:
if systemd_active:
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
return subprocess.CompletedProcess(cmd, 3, stdout="inactive\n", stderr="")
else:
# System-level check (no --user)
if system_service_active:
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
return subprocess.CompletedProcess(cmd, 3, stdout="inactive\n", stderr="")
# systemctl --user restart
# systemctl restart — distinguish --user from system scope
if "systemctl" in joined and "restart" in joined:
if "--user" not in joined and system_service_active:
stderr = "" if system_restart_rc == 0 else "Failed to restart: Permission denied"
return subprocess.CompletedProcess(cmd, system_restart_rc, stdout="", stderr=stderr)
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
# launchctl list ai.hermes.gateway
@ -393,3 +404,91 @@ class TestCmdUpdateLaunchdRestart:
assert "Stopped gateway" not in captured
assert "Gateway restarted" not in captured
assert "Gateway restarted via launchd" not in captured
# ---------------------------------------------------------------------------
# cmd_update — system-level systemd service detection
# ---------------------------------------------------------------------------
class TestCmdUpdateSystemService:
"""cmd_update detects system-level gateway services where --user fails."""
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_update_detects_system_service_and_restarts(
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
):
"""When user systemd is inactive but a system service exists, restart via system scope."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
mock_run.side_effect = _make_run_side_effect(
commit_count="3",
systemd_active=False,
system_service_active=True,
)
with patch("gateway.status.get_running_pid", return_value=12345), \
patch("gateway.status.remove_pid_file"):
cmd_update(mock_args)
captured = capsys.readouterr().out
assert "system gateway service" in captured.lower()
assert "Gateway restarted (system service)" in captured
# Verify systemctl restart (no --user) was called
restart_calls = [
c for c in mock_run.call_args_list
if "restart" in " ".join(str(a) for a in c.args[0])
and "systemctl" in " ".join(str(a) for a in c.args[0])
and "--user" not in " ".join(str(a) for a in c.args[0])
]
assert len(restart_calls) == 1
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_update_system_service_restart_failure_shows_sudo_hint(
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
):
"""When system service restart fails (e.g. no root), show sudo hint."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
mock_run.side_effect = _make_run_side_effect(
commit_count="3",
systemd_active=False,
system_service_active=True,
system_restart_rc=1,
)
with patch("gateway.status.get_running_pid", return_value=12345), \
patch("gateway.status.remove_pid_file"):
cmd_update(mock_args)
captured = capsys.readouterr().out
assert "sudo systemctl restart" in captured
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_user_service_takes_priority_over_system(
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
):
"""When both user and system services are active, user wins."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
mock_run.side_effect = _make_run_side_effect(
commit_count="3",
systemd_active=True,
system_service_active=True,
)
with patch("gateway.status.get_running_pid", return_value=12345), \
patch("gateway.status.remove_pid_file"), \
patch("os.kill"):
cmd_update(mock_args)
captured = capsys.readouterr().out
# Should restart via user service, not system
assert "Gateway restarted." in captured
assert "(system service)" not in captured