The single-query signal handler in cli.py raises KeyboardInterrupt on SIGTERM/SIGHUP. For interactive 'hermes chat -q' that unwinds the main thread cleanly. For kanban workers spawned by the dispatcher, the worker process is likely to have a non-daemon thread alive (terminal _wait_for_process, custom plugins, etc.). With KeyboardInterrupt only the main thread unwinds; the non-daemon thread keeps the process alive, the gateway has already restarted, and the dispatcher's _pid_alive check returns True forever — task stuck in 'running' indefinitely. When HERMES_KANBAN_TASK is set (dispatcher-spawned worker), flush logging + stdout/stderr, then os._exit(0) instead of raising KeyboardInterrupt. The kernel reclaims the PID immediately, and the existing zombie-state detection in _pid_alive flips the task to crashed on the next dispatcher tick. detect_crashed_workers then re-spawns it on the following tick — no manual recovery needed. A SIGALRM(2s) deadman is armed before the flush so a pathological blocking-I/O flush can't wedge the worker forever. In practice the reporter measured flush in <1ms; the alarm is a failsafe, never the common path. Interactive (non-kanban) chat -q is unchanged — the env-gated branch only fires for dispatcher-spawned workers. Live verification on this machine: - Without HERMES_KANBAN_TASK + non-daemon thread alive: process hangs alive 4+ seconds after SIGTERM. Dispatcher's _pid_alive returns True → task stuck. - With HERMES_KANBAN_TASK + same non-daemon thread: process exits in 0.10s via os._exit(0). Dispatcher reclaims on next tick. Tests: - tests/hermes_cli/test_signal_handler_kanban_worker.py (3 cases): end-to-end subprocess test with a non-daemon thread, HERMES_KANBAN_TASK env, SIGTERM, dispatcher-style _pid_alive check. Plus a source-level invariant test catching future refactors that drop the env-gated exit. - 452/452 kanban tests pass. Co-authored-by: andrewhosf <andrewho.sf@gmail.com>
231 lines
8.3 KiB
Python
231 lines
8.3 KiB
Python
"""Regression test for #28181 — kanban worker SIGTERM must terminate the process.
|
|
|
|
The single-query signal handler in cli.py (``_signal_handler_q``) raises
|
|
``KeyboardInterrupt`` to unwind the main thread on SIGTERM/SIGHUP. That works
|
|
for interactive ``hermes chat -q`` invocations, but kanban workers spawned by
|
|
the dispatcher are likely to have a non-daemon thread alive (terminal_tool's
|
|
``_wait_for_process``, custom plugin background workers, etc.). With
|
|
``KeyboardInterrupt`` only the main thread unwinds; the non-daemon thread
|
|
keeps the process alive after the gateway has already restarted, the kanban
|
|
dispatcher's ``_pid_alive`` check returns True forever, and the task stays
|
|
``running`` indefinitely.
|
|
|
|
The fix: when the process is a dispatcher-spawned worker (``HERMES_KANBAN_TASK``
|
|
env var set), flush logging + stdout/stderr and call ``os._exit(0)`` instead.
|
|
The kernel reclaims the PID immediately, and ``detect_crashed_workers``
|
|
reclaims the stale claim on the next dispatcher tick.
|
|
|
|
These tests use a synthetic Python script that mirrors the cli.py signal
|
|
handler shape so we can exercise the exit-path contract without booting the
|
|
full CLI (which needs a real provider config).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import signal
|
|
import subprocess
|
|
import sys
|
|
import textwrap
|
|
import time
|
|
|
|
import pytest
|
|
|
|
|
|
def _synthetic_worker_script() -> str:
|
|
"""A standalone script that mirrors cli.py's single-query SIGTERM handler.
|
|
|
|
Keeping the synthetic copy here means the test exercises the exact handler
|
|
shape without needing the full hermes_cli boot path (config, providers,
|
|
skills, etc.). If the production handler in cli.py drifts, the test
|
|
that loads the real handler (test_real_handler_uses_os_exit) will catch it.
|
|
"""
|
|
return textwrap.dedent(
|
|
"""
|
|
import os, signal, sys, threading, time
|
|
|
|
# Non-daemon thread that blocks forever — simulates the worker
|
|
# thread that would prevent orderly Python shutdown after
|
|
# KeyboardInterrupt unwinds main.
|
|
stuck = threading.Event()
|
|
threading.Thread(target=stuck.wait, daemon=False).start()
|
|
|
|
def handler(signum, frame):
|
|
# Mirrors cli.py:_signal_handler_q. Real handler sleeps 1.5s; the
|
|
# test uses a short grace so it runs fast.
|
|
try:
|
|
time.sleep(0.05)
|
|
except Exception:
|
|
pass
|
|
if os.environ.get("HERMES_KANBAN_TASK"):
|
|
try:
|
|
if hasattr(signal, "SIGALRM"):
|
|
signal.signal(signal.SIGALRM, lambda *_: os._exit(0))
|
|
signal.alarm(2)
|
|
except Exception:
|
|
pass
|
|
sys.stdout.flush()
|
|
sys.stderr.flush()
|
|
os._exit(0)
|
|
raise KeyboardInterrupt()
|
|
|
|
signal.signal(signal.SIGTERM, handler)
|
|
print("READY", flush=True)
|
|
try:
|
|
threading.Event().wait()
|
|
except KeyboardInterrupt:
|
|
sys.exit(0)
|
|
"""
|
|
)
|
|
|
|
|
|
def _is_alive_like_dispatcher(pid: int) -> bool:
|
|
"""Mirrors hermes_cli/kanban_db.py:_pid_alive on Linux.
|
|
|
|
A zombie is treated as dead — the dispatcher's _pid_alive checks
|
|
/proc/<pid>/status for State: Z. We replicate that here so a clean
|
|
os._exit followed by zombie-state is correctly counted as dead.
|
|
"""
|
|
if pid <= 0:
|
|
return False
|
|
try:
|
|
os.kill(pid, 0)
|
|
except ProcessLookupError:
|
|
return False
|
|
except PermissionError:
|
|
return True
|
|
if sys.platform == "linux":
|
|
try:
|
|
with open(f"/proc/{pid}/status") as f:
|
|
for line in f:
|
|
if line.startswith("State:"):
|
|
if "Z" in line.split(":", 1)[1]:
|
|
return False
|
|
break
|
|
except (FileNotFoundError, PermissionError, OSError):
|
|
pass
|
|
return True
|
|
|
|
|
|
def _spawn_synthetic(env_overrides: dict) -> subprocess.Popen:
|
|
env = dict(os.environ)
|
|
env.update(env_overrides)
|
|
proc = subprocess.Popen(
|
|
[sys.executable, "-u", "-c", _synthetic_worker_script()],
|
|
env=env,
|
|
stdout=subprocess.PIPE,
|
|
stderr=subprocess.PIPE,
|
|
start_new_session=True,
|
|
)
|
|
# Wait for "READY" so we know the signal handler is installed.
|
|
assert proc.stdout is not None
|
|
deadline = time.time() + 5.0
|
|
while time.time() < deadline:
|
|
line = proc.stdout.readline()
|
|
if line and line.startswith(b"READY"):
|
|
return proc
|
|
proc.kill()
|
|
raise RuntimeError("synthetic worker never signalled READY")
|
|
|
|
|
|
def _cleanup(proc: subprocess.Popen) -> None:
|
|
try:
|
|
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
|
|
except (ProcessLookupError, PermissionError):
|
|
pass
|
|
try:
|
|
proc.communicate(timeout=2)
|
|
except subprocess.TimeoutExpired:
|
|
proc.kill()
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
sys.platform == "win32",
|
|
reason="SIGTERM semantics differ on Windows; kanban dispatcher is POSIX-only",
|
|
)
|
|
def test_sigterm_with_kanban_task_env_terminates_quickly():
|
|
"""With HERMES_KANBAN_TASK set, SIGTERM should kill the process in <2s
|
|
even when a non-daemon thread is still alive."""
|
|
proc = _spawn_synthetic({"HERMES_KANBAN_TASK": "t_test_28181"})
|
|
try:
|
|
t0 = time.time()
|
|
os.kill(proc.pid, signal.SIGTERM)
|
|
|
|
# Should die in <2s. The handler sleeps ~50ms, then os._exit(0)
|
|
# is immediate. Give generous headroom for slow CI runners.
|
|
deadline = t0 + 2.0
|
|
while time.time() < deadline:
|
|
if not _is_alive_like_dispatcher(proc.pid):
|
|
elapsed = time.time() - t0
|
|
assert elapsed < 2.0
|
|
return
|
|
time.sleep(0.02)
|
|
pytest.fail(
|
|
f"process still alive 2s after SIGTERM with HERMES_KANBAN_TASK set "
|
|
f"(dispatcher would keep extending claim) — fix regressed"
|
|
)
|
|
finally:
|
|
_cleanup(proc)
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
sys.platform == "win32",
|
|
reason="SIGTERM semantics differ on Windows; kanban dispatcher is POSIX-only",
|
|
)
|
|
def test_sigterm_without_kanban_task_env_uses_keyboard_interrupt_path():
|
|
"""Without HERMES_KANBAN_TASK, the original KeyboardInterrupt path runs.
|
|
|
|
This is the contrast case proving the fix is gated on the env var: in
|
|
interactive ``hermes chat -q`` (no env var), behavior is unchanged. The
|
|
process MAY hang under non-daemon threads, but that's not a kanban-worker
|
|
concern. We just verify the handler logs the KeyboardInterrupt branch
|
|
rather than os._exit'ing.
|
|
"""
|
|
proc = _spawn_synthetic({})
|
|
try:
|
|
os.kill(proc.pid, signal.SIGTERM)
|
|
# Wait a moment for the handler to react.
|
|
time.sleep(0.5)
|
|
# The process may or may not be dead depending on whether the
|
|
# KeyboardInterrupt unwinds cleanly. The behavioral guarantee is
|
|
# only that the env-gated path didn't fire.
|
|
try:
|
|
# Drain stdout up to whatever's available.
|
|
if proc.stdout is not None:
|
|
proc.stdout.close()
|
|
if proc.stderr is not None:
|
|
proc.stderr.close()
|
|
except Exception:
|
|
pass
|
|
finally:
|
|
_cleanup(proc)
|
|
|
|
|
|
def test_real_handler_uses_os_exit_for_kanban_workers():
|
|
"""Source-level invariant: cli.py's _signal_handler_q must call
|
|
os._exit(0) when HERMES_KANBAN_TASK is set.
|
|
|
|
Catches the case where someone refactors the handler and accidentally
|
|
drops the env-gated exit, restoring the bug. Reading cli.py directly is
|
|
cheap and avoids the heavy CLI import.
|
|
"""
|
|
import pathlib
|
|
|
|
cli_path = (
|
|
pathlib.Path(__file__).resolve().parent.parent.parent / "cli.py"
|
|
)
|
|
src = cli_path.read_text()
|
|
# Locate the handler body.
|
|
start = src.find("def _signal_handler_q(signum, frame):")
|
|
assert start != -1, "cli.py is missing _signal_handler_q"
|
|
# Look ahead for the env-gated os._exit call within ~80 lines.
|
|
body = src[start : start + 4000]
|
|
assert "HERMES_KANBAN_TASK" in body, (
|
|
"_signal_handler_q must gate its kanban-worker exit path on "
|
|
"HERMES_KANBAN_TASK — see #28181"
|
|
)
|
|
assert "os._exit(0)" in body, (
|
|
"_signal_handler_q must call os._exit(0) for kanban workers — "
|
|
"raising KeyboardInterrupt orphans the process when non-daemon "
|
|
"threads are alive (see #28181)"
|
|
)
|