Inside an s6 container, `gateway run` redirects to the supervised
gateway and then keeps the CMD process alive as a no-op heartbeat so
/init doesn't start stage-3 shutdown. That heartbeat is
`os.execvp("sleep", ["sleep", "infinity"])`, which does a PATH lookup
for the `sleep` binary. When PATH was empty/truncated/clobbered at that
point — e.g. after user customizations rewrote PATH, or on a minimal
image without `sleep` on PATH — the exec raised FileNotFoundError,
killing the CMD process and causing /init to tear down every service:
the container failed to start (issue #36208, a regression in the s6
image from 2026.5.28).
Wrap the exec in try/except OSError: on success it still replaces the
process with the cheap `sleep` heartbeat (no resident Python
interpreter, and the existing process-tree/recursion contract is
preserved); on failure it falls back to `_block_until_terminated()` —
a SIGTERM handler (clean 128+signum exit on `docker stop`) plus a
signal.pause() loop, which needs no external binary and so can't fail
on PATH state. A threading.Event().wait() fallback covers platforms
without signal.pause().
Keeping execvp as the primary path (rather than replacing it outright)
preserves the `sleep infinity` heartbeat that the docker integration
tests assert (test_gateway_run_supervised.py) and avoids leaving a
full Python interpreter resident for the container's lifetime.
Verified end-to-end on a built image: with execvp forced to fail,
_block_until_terminated() blocks cleanly instead of raising
FileNotFoundError; normal boot still runs the cheap `sleep infinity`
heartbeat; the 6 test_gateway_run_supervised.py integration tests pass.
Salvages the two community fixes for this issue — the fallback design
from #36221 (@Pluviobyte) and the signal.pause() heartbeat from #36267
(@karmeleon) — and adds regression tests for both the normal and
sleep-missing paths.
Co-authored-by: Pluviobyte <Pluviobyte@users.noreply.github.com>
Co-authored-by: karmeleon <karmeleon@users.noreply.github.com>
Closes #36208.
This commit is contained in:
@ -5877,15 +5877,60 @@ def _maybe_redirect_run_to_s6_supervision(args) -> bool:
|
||||
file=sys.stderr,
|
||||
flush=True,
|
||||
)
|
||||
# Block until the container is signalled. The supervised gateway's
|
||||
# lifetime is independent of this process — s6-supervise restarts
|
||||
# it on crash, and we don't want the container to exit when the
|
||||
# gateway flaps. `sleep infinity` matches the static main-hermes
|
||||
# service's pattern (see docker/s6-rc.d/main-hermes/run): the CMD
|
||||
# process is a no-op heartbeat that keeps /init alive until
|
||||
# Keep the CMD process alive as a no-op heartbeat. The supervised
|
||||
# gateway's lifetime is independent of this process — s6-supervise
|
||||
# restarts it on crash, and we don't want the container to exit when
|
||||
# the gateway flaps. The CMD process keeps /init alive until
|
||||
# `docker stop` sends SIGTERM, at which point /init runs stage 3
|
||||
# shutdown (which tears down the supervised gateway cleanly).
|
||||
os.execvp("sleep", ["sleep", "infinity"])
|
||||
#
|
||||
# Prefer `sleep infinity` (matches the static main-hermes service's
|
||||
# pattern in docker/s6-rc.d/main-hermes/run, and frees the Python
|
||||
# interpreter — the heartbeat is a tiny `sleep` process, not a
|
||||
# resident interpreter). But `os.execvp` does a PATH lookup for the
|
||||
# `sleep` binary and historically crashed the whole container with
|
||||
# FileNotFoundError when PATH was empty/truncated/clobbered at this
|
||||
# point — e.g. after user customizations rewrote PATH, or on minimal
|
||||
# images without `sleep` on PATH (issue #36208). Fall back to an
|
||||
# in-process block (no external binary, can't fail on PATH) so the
|
||||
# container keeps running instead of dying during boot.
|
||||
try:
|
||||
os.execvp("sleep", ["sleep", "infinity"])
|
||||
except OSError:
|
||||
# execvp only returns by raising; on success it replaces this
|
||||
# process. ENOENT (no `sleep` on PATH) and any other exec error
|
||||
# land here.
|
||||
print(
|
||||
"→ `sleep` is unavailable; keeping the s6 CMD process alive "
|
||||
"in-process until the container is stopped.",
|
||||
file=sys.stderr,
|
||||
flush=True,
|
||||
)
|
||||
_block_until_terminated()
|
||||
return True # unreachable on the execvp success path
|
||||
|
||||
|
||||
def _block_until_terminated() -> None:
|
||||
"""Keep the s6 CMD process alive until the container is stopped.
|
||||
|
||||
Fallback heartbeat for when ``os.execvp("sleep", ...)`` can't run
|
||||
(``sleep`` missing from PATH — issue #36208). Installs a SIGTERM
|
||||
handler that exits with the conventional 128+signum code so
|
||||
``docker stop`` produces a clean, expected exit, then blocks on
|
||||
``signal.pause()``. Falls back to ``threading.Event().wait()`` on
|
||||
platforms without ``signal.pause()`` (e.g. Windows) — although this
|
||||
path only runs inside the s6 Linux container image, the fallback
|
||||
keeps the helper safe to import and unit-test anywhere.
|
||||
"""
|
||||
signal.signal(signal.SIGTERM, lambda signum, _frame: sys.exit(128 + signum))
|
||||
pause = getattr(signal, "pause", None)
|
||||
if pause is not None:
|
||||
while True:
|
||||
pause()
|
||||
else: # pragma: no cover - non-Unix fallback, not exercised in the s6 image
|
||||
import threading
|
||||
|
||||
threading.Event().wait()
|
||||
|
||||
|
||||
def _gateway_command_inner(args):
|
||||
|
||||
@ -361,28 +361,6 @@ def _stub_s6(monkeypatch: pytest.MonkeyPatch, *, on_s6: bool) -> _CallRecorder:
|
||||
return rec
|
||||
|
||||
|
||||
class _ExecvpCalled(BaseException):
|
||||
"""Sentinel raised by the os.execvp stub so tests can assert on it
|
||||
without actually replacing the test runner process. Inherits from
|
||||
BaseException so it bypasses generic ``except Exception`` blocks in
|
||||
the code under test (just like a real exec would)."""
|
||||
|
||||
def __init__(self, argv: list[str]) -> None:
|
||||
self.argv = argv
|
||||
|
||||
|
||||
def _stub_execvp(monkeypatch: pytest.MonkeyPatch) -> list[list[str]]:
|
||||
"""Replace os.execvp with a recorder that raises _ExecvpCalled."""
|
||||
calls: list[list[str]] = []
|
||||
|
||||
def fake_execvp(file: str, args: list[str]) -> None: # noqa: ANN401
|
||||
calls.append([file, *args])
|
||||
raise _ExecvpCalled([file, *args])
|
||||
|
||||
monkeypatch.setattr("hermes_cli.gateway.os.execvp", fake_execvp)
|
||||
return calls
|
||||
|
||||
|
||||
def test_redirect_noop_on_host(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
"""Host runs (non-s6) must not redirect. Returns False; caller
|
||||
continues to the foreground gateway code path unchanged."""
|
||||
@ -407,14 +385,31 @@ def test_redirect_fires_inside_s6_container(
|
||||
|
||||
1. Dispatch `start` to the service manager.
|
||||
2. Print the loud breadcrumb to stderr.
|
||||
3. exec `sleep infinity` to keep the CMD alive without binding
|
||||
container lifetime to gateway PID lifetime.
|
||||
3. exec `sleep infinity` to keep the CMD alive (the cheap heartbeat;
|
||||
no resident Python interpreter) without binding container
|
||||
lifetime to gateway PID lifetime.
|
||||
"""
|
||||
from hermes_cli import gateway as gw
|
||||
|
||||
rec = _stub_s6(monkeypatch, on_s6=True)
|
||||
monkeypatch.setattr("hermes_cli.gateway._profile_suffix", lambda: "")
|
||||
execvp_calls = _stub_execvp(monkeypatch)
|
||||
|
||||
class _ExecvpCalled(BaseException):
|
||||
def __init__(self, argv: list[str]) -> None:
|
||||
self.argv = argv
|
||||
|
||||
execvp_calls: list[list[str]] = []
|
||||
|
||||
def fake_execvp(file: str, args: list[str]) -> None:
|
||||
execvp_calls.append([file, *args])
|
||||
raise _ExecvpCalled([file, *args])
|
||||
|
||||
monkeypatch.setattr("hermes_cli.gateway.os.execvp", fake_execvp)
|
||||
# If the fallback ran, the normal sleep path was wrongly skipped.
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.gateway._block_until_terminated",
|
||||
lambda: pytest.fail("fallback should not run when sleep is available"),
|
||||
)
|
||||
monkeypatch.delenv("HERMES_S6_SUPERVISED_CHILD", raising=False)
|
||||
monkeypatch.delenv("HERMES_GATEWAY_NO_SUPERVISE", raising=False)
|
||||
|
||||
@ -428,11 +423,90 @@ def test_redirect_fires_inside_s6_container(
|
||||
assert "s6 supervision" in err
|
||||
assert "--no-supervise" in err
|
||||
assert "HERMES_GATEWAY_NO_SUPERVISE" in err
|
||||
# 3. exec'd `sleep infinity`.
|
||||
# 3. exec'd `sleep infinity` (the preferred cheap heartbeat).
|
||||
assert execvp_calls == [["sleep", "sleep", "infinity"]]
|
||||
assert excinfo.value.argv == ["sleep", "sleep", "infinity"]
|
||||
|
||||
|
||||
def test_redirect_falls_back_when_sleep_missing(
|
||||
monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str],
|
||||
) -> None:
|
||||
"""Regression guard for issue #36208: when ``os.execvp("sleep", ...)``
|
||||
raises (no `sleep` on a clobbered/empty PATH, or a minimal image
|
||||
without it), the redirect must NOT crash the container — it falls
|
||||
back to the in-process ``_block_until_terminated`` heartbeat so the
|
||||
container keeps running.
|
||||
"""
|
||||
from hermes_cli import gateway as gw
|
||||
|
||||
rec = _stub_s6(monkeypatch, on_s6=True)
|
||||
monkeypatch.setattr("hermes_cli.gateway._profile_suffix", lambda: "")
|
||||
|
||||
def missing_sleep(file: str, args: list[str]) -> None:
|
||||
raise FileNotFoundError(2, "No such file or directory", file)
|
||||
|
||||
monkeypatch.setattr("hermes_cli.gateway.os.execvp", missing_sleep)
|
||||
block_calls: list[bool] = []
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.gateway._block_until_terminated",
|
||||
lambda: block_calls.append(True),
|
||||
)
|
||||
monkeypatch.delenv("HERMES_S6_SUPERVISED_CHILD", raising=False)
|
||||
monkeypatch.delenv("HERMES_GATEWAY_NO_SUPERVISE", raising=False)
|
||||
|
||||
# Must not raise FileNotFoundError — that was the #36208 crash.
|
||||
result = gw._maybe_redirect_run_to_s6_supervision(_Args())
|
||||
|
||||
assert result is True
|
||||
assert rec.calls == [("start", "gateway-default")]
|
||||
# Fell back to the in-process heartbeat instead of crashing.
|
||||
assert block_calls == [True]
|
||||
err = capsys.readouterr().err
|
||||
assert "`sleep` is unavailable" in err
|
||||
|
||||
|
||||
def test_block_until_terminated_installs_sigterm_handler_and_blocks(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""``_block_until_terminated`` must register a SIGTERM handler (so
|
||||
`docker stop` exits cleanly) and then block on signal.pause() — never
|
||||
touching an external binary. Regression guard for issue #36208, where
|
||||
os.execvp("sleep", ...) crashed the container with FileNotFoundError
|
||||
when PATH lacked a directory containing `sleep`.
|
||||
"""
|
||||
import signal as _signal
|
||||
from hermes_cli import gateway as gw
|
||||
|
||||
registered: dict[int, object] = {}
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.gateway.signal.signal",
|
||||
lambda signum, handler: registered.__setitem__(signum, handler),
|
||||
)
|
||||
|
||||
# Make signal.pause() raise after the first call so the infinite loop
|
||||
# terminates deterministically instead of hanging the test.
|
||||
pause_calls = {"n": 0}
|
||||
|
||||
def fake_pause() -> None:
|
||||
pause_calls["n"] += 1
|
||||
raise KeyboardInterrupt # break out of the `while True: pause()` loop
|
||||
|
||||
monkeypatch.setattr("hermes_cli.gateway.signal.pause", fake_pause)
|
||||
|
||||
with pytest.raises(KeyboardInterrupt):
|
||||
gw._block_until_terminated()
|
||||
|
||||
# A SIGTERM handler was installed...
|
||||
assert _signal.SIGTERM in registered
|
||||
# ...and it exits with the conventional 128+signum code.
|
||||
handler = registered[_signal.SIGTERM]
|
||||
with pytest.raises(SystemExit) as exc:
|
||||
handler(_signal.SIGTERM, None) # type: ignore[operator]
|
||||
assert exc.value.code == 128 + _signal.SIGTERM
|
||||
# ...and we actually blocked on pause().
|
||||
assert pause_calls["n"] == 1
|
||||
|
||||
|
||||
def test_redirect_short_circuits_supervised_child(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
@ -516,10 +590,25 @@ def test_redirect_no_supervise_env_falsy_values_dont_opt_out(
|
||||
|
||||
_stub_s6(monkeypatch, on_s6=True)
|
||||
monkeypatch.setattr("hermes_cli.gateway._profile_suffix", lambda: "")
|
||||
_stub_execvp(monkeypatch)
|
||||
|
||||
# The redirect reaching its `sleep` heartbeat means it did NOT opt
|
||||
# out. Stub execvp to record + raise (so it doesn't replace the test
|
||||
# process) rather than actually exec.
|
||||
class _ExecvpCalled(BaseException):
|
||||
pass
|
||||
|
||||
execvp_calls: list[str] = []
|
||||
|
||||
def fake_execvp(file: str, args: list[str]) -> None:
|
||||
execvp_calls.append(file)
|
||||
raise _ExecvpCalled
|
||||
|
||||
monkeypatch.setattr("hermes_cli.gateway.os.execvp", fake_execvp)
|
||||
monkeypatch.delenv("HERMES_S6_SUPERVISED_CHILD", raising=False)
|
||||
|
||||
for falsy in ("", "0", "false", "no", "off", "garbage"):
|
||||
execvp_calls.clear()
|
||||
monkeypatch.setenv("HERMES_GATEWAY_NO_SUPERVISE", falsy)
|
||||
with pytest.raises(_ExecvpCalled):
|
||||
gw._maybe_redirect_run_to_s6_supervision(_Args())
|
||||
assert execvp_calls == ["sleep"], f"redirect should fire for {falsy!r}"
|
||||
|
||||
Reference in New Issue
Block a user