diff --git a/hermes_cli/container_boot.py b/hermes_cli/container_boot.py
index 739f1e95f..4e9afe4cb 100644
--- a/hermes_cli/container_boot.py
+++ b/hermes_cli/container_boot.py
@@ -24,7 +24,7 @@ import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Literal
+from typing import Literal, Sequence
 
 log = logging.getLogger(__name__)
 
@@ -57,6 +57,7 @@ def reconcile_profile_gateways(
     hermes_home: Path,
     scandir: Path,
     dry_run: bool = False,
+    container_argv: Sequence[str] | None = None,
 ) -> list[ReconcileAction]:
     """Recreate s6 service registrations for every persistent profile.
 
@@ -82,6 +83,8 @@ def reconcile_profile_gateways(
             directories are created at ``/gateway-/``.
         dry_run: When True, walk and return the action list without
             touching the filesystem. For tests and `--dry-run` debug.
+        container_argv: Optional container PID 1 argv override. Production
+            reads ``/proc/1/cmdline``; tests inject it directly.
 
     Returns:
         One :class:`ReconcileAction` per profile, in this order:
@@ -93,8 +96,15 @@ def reconcile_profile_gateways(
     # populated the root profile dir. The slot exists so
     # ``hermes gateway start`` (no ``-p``) has somewhere to land;
     # auto-up only when the prior state was "running" (same rule as
-    # named profiles).
-    default_prior_state = _read_prior_state(hermes_home)
+    # named profiles). If the container was launched with the legacy
+    # `gateway run` command and no state exists yet, seed that intent
+    # as `running` so the s6 reconciler preserves the pre-s6 behavior.
+    legacy_default_state = _maybe_migrate_legacy_gateway_run_state(
+        hermes_home,
+        container_argv=container_argv,
+        dry_run=dry_run,
+    )
+    default_prior_state = legacy_default_state or _read_prior_state(hermes_home)
     default_should_start = default_prior_state in _AUTOSTART_STATES
     if not dry_run:
         _cleanup_stale_runtime_files(hermes_home)
@@ -147,6 +157,85 @@ def reconcile_profile_gateways(
     return actions
 
 
+def _maybe_migrate_legacy_gateway_run_state(
+    hermes_home: Path,
+    *,
+    container_argv: Sequence[str] | None,
+    dry_run: bool,
+) -> str | None:
+    """Seed root gateway_state for pre-s6 `gateway run` containers.
+
+    The tini image let Docker users run the gateway as the container
+    command (`docker run ... gateway run`). After the s6 migration,
+    profile gateways are restored from persisted gateway_state.json; a
+    legacy container with no state file would therefore register the
+    default service down and never start. Only synthesize state when no
+    root gateway_state.json exists so explicit stopped/failed states keep
+    winning across restarts.
+
+    Returns ``"running"`` when the legacy intent applies (the state file
+    is only written when ``dry_run`` is False), otherwise ``None``.
+    """
+    state_file = hermes_home / "gateway_state.json"
+    if state_file.exists():
+        return None
+
+    if os.environ.get("HERMES_GATEWAY_NO_SUPERVISE", "").lower() in ("1", "true", "yes"):
+        return None
+
+    argv = tuple(container_argv) if container_argv is not None else _read_container_argv()
+    if not _is_legacy_gateway_run_request(argv):
+        return None
+
+    if not dry_run:
+        import time
+        payload = json.dumps({
+            "gateway_state": "running",
+            "timestamp": int(time.time()),
+            "migrated_from": "legacy-container-cmd",
+        }) + "\n"
+        try:
+            # Exclusive create: never clobber a state file that appeared
+            # after the exists() check above.
+            with state_file.open("x", encoding="utf-8") as fh:
+                fh.write(payload)
+        except FileExistsError:
+            # Real state was persisted in the meantime; let it win.
+            return None
+        except OSError as exc:
+            # An unwritable HERMES_HOME must not abort boot-time
+            # reconciliation; the legacy intent still applies this boot.
+            log.warning(
+                "Could not persist migrated gateway state to %s: %s",
+                state_file,
+                exc,
+            )
+    return "running"
+
+
+def _read_container_argv() -> tuple[str, ...]:
+    """Best-effort read of the container PID 1 argv."""
+    try:
+        raw = Path("/proc/1/cmdline").read_bytes()
+    except OSError:
+        return ()
+    return tuple(part.decode("utf-8", "replace") for part in raw.split(b"\0") if part)
+
+
+def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool:
+    """Return True for Docker commands equivalent to `gateway run`."""
+    args = list(argv)
+    if args and Path(args[0]).name == "init":
+        args = args[1:]
+    if args and args[0].endswith("main-wrapper.sh"):
+        args = args[1:]
+    if args and Path(args[0]).name == "hermes":
+        args = args[1:]
+    if "--no-supervise" in args:
+        return False
+    return len(args) >= 2 and args[0] == "gateway" and args[1] == "run"
+
+
 def _read_prior_state(profile_dir: Path) -> str | None:
     """Read gateway_state.json's ``gateway_state`` field, or None if missing
     or unparseable. Unparseable counts as "no prior state" so
diff --git a/tests/hermes_cli/test_container_boot.py b/tests/hermes_cli/test_container_boot.py
index 58ad016f2..5af9c9f71 100644
--- a/tests/hermes_cli/test_container_boot.py
+++ b/tests/hermes_cli/test_container_boot.py
@@ -484,6 +484,102 @@ def test_default_slot_autostarts_when_root_state_running(tmp_path: Path) -> None
     assert not (scandir / "gateway-default" / "down").exists()
 
 
+@pytest.mark.parametrize(
+    "container_argv",
+    [
+        ("gateway", "run"),
+        ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"),
+    ],
+)
+def test_legacy_gateway_run_cmd_seeds_default_running_state(
+    tmp_path: Path,
+    container_argv: tuple[str, ...],
+) -> None:
+    """Pre-s6 Docker users often ran `gateway run` as the container
+    command. With no persisted gateway_state.json yet, s6 reconciliation
+    must migrate that legacy intent into a running default gateway slot."""
+    scandir = tmp_path / "run-service"; scandir.mkdir()
+
+    actions = reconcile_profile_gateways(
+        hermes_home=tmp_path,
+        scandir=scandir,
+        dry_run=False,
+        container_argv=container_argv,
+    )
+
+    default_action = next(a for a in actions if a.profile == "default")
+    assert default_action.prior_state == "running"
+    assert default_action.action == "started"
+    assert not (scandir / "gateway-default" / "down").exists()
+    state = json.loads((tmp_path / "gateway_state.json").read_text())
+    assert state["gateway_state"] == "running"
+    assert state["migrated_from"] == "legacy-container-cmd"
+
+
+def test_legacy_gateway_run_dry_run_does_not_write_state(tmp_path: Path) -> None:
+    """Dry-run detects the legacy command but must not write the state file."""
+    scandir = tmp_path / "run-service"; scandir.mkdir()
+
+    reconcile_profile_gateways(
+        hermes_home=tmp_path,
+        scandir=scandir,
+        dry_run=True,
+        container_argv=("gateway", "run"),
+    )
+
+    assert not (tmp_path / "gateway_state.json").exists()
+
+
+@pytest.mark.parametrize(
+    "container_argv",
+    [
+        ("gateway", "run", "--no-supervise"),
+        ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run", "--no-supervise"),
+    ],
+)
+def test_legacy_gateway_run_no_supervise_does_not_seed_s6_state(
+    tmp_path: Path,
+    container_argv: tuple[str, ...],
+) -> None:
+    """`gateway run --no-supervise` is an explicit opt-out from s6 migration."""
+    scandir = tmp_path / "run-service"; scandir.mkdir()
+
+    actions = reconcile_profile_gateways(
+        hermes_home=tmp_path,
+        scandir=scandir,
+        dry_run=False,
+        container_argv=container_argv,
+    )
+
+    default_action = next(a for a in actions if a.profile == "default")
+    assert default_action.prior_state is None
+    assert default_action.action == "registered"
+    assert (scandir / "gateway-default" / "down").exists()
+    assert not (tmp_path / "gateway_state.json").exists()
+
+
+def test_legacy_gateway_run_env_no_supervise_does_not_seed_s6_state(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Env opt-out matches the CLI `--no-supervise` flag."""
+    scandir = tmp_path / "run-service"; scandir.mkdir()
+    monkeypatch.setenv("HERMES_GATEWAY_NO_SUPERVISE", "1")
+
+    actions = reconcile_profile_gateways(
+        hermes_home=tmp_path,
+        scandir=scandir,
+        dry_run=False,
+        container_argv=("gateway", "run"),
+    )
+
+    default_action = next(a for a in actions if a.profile == "default")
+    assert default_action.prior_state is None
+    assert default_action.action == "registered"
+    assert (scandir / "gateway-default" / "down").exists()
+    assert not (tmp_path / "gateway_state.json").exists()
+
+
 def test_default_slot_does_not_autostart_when_root_state_stopped(
     tmp_path: Path,
 ) -> None:
@@ -491,12 +587,17 @@ def test_default_slot_does_not_autostart_when_root_state_stopped(
     _seed_default_root(tmp_path, state="stopped")
 
     actions = reconcile_profile_gateways(
-        hermes_home=tmp_path, scandir=scandir, dry_run=False,
+        hermes_home=tmp_path,
+        scandir=scandir,
+        dry_run=False,
+        container_argv=("gateway", "run"),
     )
 
     default_action = next(a for a in actions if a.profile == "default")
     assert default_action.action == "registered"
     assert (scandir / "gateway-default" / "down").exists()
+    state = json.loads((tmp_path / "gateway_state.json").read_text())
+    assert state["gateway_state"] == "stopped"
 
 
 def test_default_slot_does_not_autostart_when_root_state_startup_failed(