fix(docker): seed s6 gateway state for legacy run cmd (#34829)

* fix(docker): seed s6 gateway state for legacy run cmd

* fix(docker): honor no-supervise during legacy gateway migration

---------

Co-authored-by: Donovan Yohan <donovan-yohan@users.noreply.github.com>
This commit is contained in:
Donovan Yohan
2026-05-31 21:28:56 -04:00
committed by GitHub
parent e1c7a9aa7b
commit dcbf62e26a
2 changed files with 161 additions and 4 deletions

View File

@ -24,7 +24,7 @@ import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Literal
from typing import Literal, Sequence
log = logging.getLogger(__name__)
@ -57,6 +57,7 @@ def reconcile_profile_gateways(
hermes_home: Path,
scandir: Path,
dry_run: bool = False,
container_argv: Sequence[str] | None = None,
) -> list[ReconcileAction]:
"""Recreate s6 service registrations for every persistent profile.
@ -82,6 +83,8 @@ def reconcile_profile_gateways(
directories are created at ``<scandir>/gateway-<profile>/``.
dry_run: When True, walk and return the action list without
touching the filesystem. For tests and `--dry-run` debug.
container_argv: Optional container PID 1 argv override. Production
reads ``/proc/1/cmdline``; tests inject it directly.
Returns:
One :class:`ReconcileAction` per profile, in this order:
@ -93,8 +96,15 @@ def reconcile_profile_gateways(
# populated the root profile dir. The slot exists so
# ``hermes gateway start`` (no ``-p``) has somewhere to land;
# auto-up only when the prior state was "running" (same rule as
# named profiles).
default_prior_state = _read_prior_state(hermes_home)
# named profiles). If the container was launched with the legacy
# `gateway run` command and no state exists yet, seed that intent
# as `running` so the s6 reconciler preserves the pre-s6 behavior.
legacy_default_state = _maybe_migrate_legacy_gateway_run_state(
hermes_home,
container_argv=container_argv,
dry_run=dry_run,
)
default_prior_state = legacy_default_state or _read_prior_state(hermes_home)
default_should_start = default_prior_state in _AUTOSTART_STATES
if not dry_run:
_cleanup_stale_runtime_files(hermes_home)
@ -147,6 +157,66 @@ def reconcile_profile_gateways(
return actions
def _maybe_migrate_legacy_gateway_run_state(
hermes_home: Path,
*,
container_argv: Sequence[str] | None,
dry_run: bool,
) -> str | None:
"""Seed root gateway_state for pre-s6 `gateway run` containers.
The tini image let Docker users run the gateway as the container
command (`docker run ... gateway run`). After the s6 migration,
profile gateways are restored from persisted gateway_state.json; a
legacy container with no state file would therefore register the
default service down and never start. Only synthesize state when no
root gateway_state.json exists so explicit stopped/failed states keep
winning across restarts.
"""
state_file = hermes_home / "gateway_state.json"
if state_file.exists():
return None
if os.environ.get("HERMES_GATEWAY_NO_SUPERVISE", "").lower() in ("1", "true", "yes"):
return None
argv = tuple(container_argv) if container_argv is not None else _read_container_argv()
if not _is_legacy_gateway_run_request(argv):
return None
if not dry_run:
import time
state_file.write_text(json.dumps({
"gateway_state": "running",
"timestamp": int(time.time()),
"migrated_from": "legacy-container-cmd",
}) + "\n")
return "running"
def _read_container_argv() -> tuple[str, ...]:
"""Best-effort read of the container PID 1 argv."""
try:
raw = Path("/proc/1/cmdline").read_bytes()
except OSError:
return ()
return tuple(part.decode("utf-8", "replace") for part in raw.split(b"\0") if part)
def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool:
"""Return True for Docker commands equivalent to `gateway run`."""
args = list(argv)
if args and Path(args[0]).name == "init":
args = args[1:]
if args and args[0].endswith("main-wrapper.sh"):
args = args[1:]
if args and Path(args[0]).name == "hermes":
args = args[1:]
if "--no-supervise" in args:
return False
return len(args) >= 2 and args[0] == "gateway" and args[1] == "run"
def _read_prior_state(profile_dir: Path) -> str | None:
"""Read gateway_state.json's ``gateway_state`` field, or None if
missing or unparseable. Unparseable counts as "no prior state" so

View File

@ -484,6 +484,88 @@ def test_default_slot_autostarts_when_root_state_running(tmp_path: Path) -> None
assert not (scandir / "gateway-default" / "down").exists()
@pytest.mark.parametrize(
    "container_argv",
    [
        ("gateway", "run"),
        ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"),
    ],
)
def test_legacy_gateway_run_cmd_seeds_default_running_state(
    tmp_path: Path,
    container_argv: tuple[str, ...],
) -> None:
    """A legacy `gateway run` container command with no persisted
    gateway_state.json must be migrated into a running default slot,
    and the synthesized state must be written to disk."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()

    results = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
        container_argv=container_argv,
    )

    # The default slot is reported as started from a "running" prior state.
    default_slot = next(entry for entry in results if entry.profile == "default")
    assert default_slot.prior_state == "running"
    assert default_slot.action == "started"

    # No `down` marker means s6 will bring the service up.
    down_marker = service_root / "gateway-default" / "down"
    assert not down_marker.exists()

    # The migration is persisted and tagged with its origin.
    state_path = tmp_path / "gateway_state.json"
    persisted = json.loads(state_path.read_text())
    assert persisted["gateway_state"] == "running"
    assert persisted["migrated_from"] == "legacy-container-cmd"
@pytest.mark.parametrize(
    "container_argv",
    [
        ("gateway", "run", "--no-supervise"),
        ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run", "--no-supervise"),
    ],
)
def test_legacy_gateway_run_no_supervise_does_not_seed_s6_state(
    tmp_path: Path,
    container_argv: tuple[str, ...],
) -> None:
    """Passing `--no-supervise` to `gateway run` opts out of the s6
    migration: no state is seeded and the default slot stays down."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()

    results = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
        container_argv=container_argv,
    )

    # The default slot is only registered, with no prior state.
    default_slot = next(entry for entry in results if entry.profile == "default")
    assert default_slot.prior_state is None
    assert default_slot.action == "registered"

    # The `down` marker is present and nothing was written to disk.
    down_marker = service_root / "gateway-default" / "down"
    assert down_marker.exists()
    state_path = tmp_path / "gateway_state.json"
    assert not state_path.exists()
def test_legacy_gateway_run_env_no_supervise_does_not_seed_s6_state(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Setting HERMES_GATEWAY_NO_SUPERVISE opts out of the migration
    just like the CLI `--no-supervise` flag does."""
    monkeypatch.setenv("HERMES_GATEWAY_NO_SUPERVISE", "1")
    service_root = tmp_path / "run-service"
    service_root.mkdir()

    results = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
        container_argv=("gateway", "run"),
    )

    # The default slot is only registered, with no prior state.
    default_slot = next(entry for entry in results if entry.profile == "default")
    assert default_slot.prior_state is None
    assert default_slot.action == "registered"

    # The `down` marker is present and nothing was written to disk.
    down_marker = service_root / "gateway-default" / "down"
    assert down_marker.exists()
    state_path = tmp_path / "gateway_state.json"
    assert not state_path.exists()
def test_default_slot_does_not_autostart_when_root_state_stopped(
tmp_path: Path,
) -> None:
@ -491,12 +573,17 @@ def test_default_slot_does_not_autostart_when_root_state_stopped(
_seed_default_root(tmp_path, state="stopped")
actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
hermes_home=tmp_path,
scandir=scandir,
dry_run=False,
container_argv=("gateway", "run"),
)
default_action = next(a for a in actions if a.profile == "default")
assert default_action.action == "registered"
assert (scandir / "gateway-default" / "down").exists()
state = json.loads((tmp_path / "gateway_state.json").read_text())
assert state["gateway_state"] == "stopped"
def test_default_slot_does_not_autostart_when_root_state_startup_failed(