fix(docker): seed s6 gateway state for legacy run cmd (#34829)

* fix(docker): seed s6 gateway state for legacy run cmd

* fix(docker): honor no-supervise during legacy gateway migration

---------

Co-authored-by: Donovan Yohan <donovan-yohan@users.noreply.github.com>
This commit is contained in:
Donovan Yohan
2026-05-31 21:28:56 -04:00
committed by GitHub
parent e1c7a9aa7b
commit dcbf62e26a
2 changed files with 161 additions and 4 deletions

View File

@ -24,7 +24,7 @@ import logging
import os import os
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Literal from typing import Literal, Sequence
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -57,6 +57,7 @@ def reconcile_profile_gateways(
hermes_home: Path, hermes_home: Path,
scandir: Path, scandir: Path,
dry_run: bool = False, dry_run: bool = False,
container_argv: Sequence[str] | None = None,
) -> list[ReconcileAction]: ) -> list[ReconcileAction]:
"""Recreate s6 service registrations for every persistent profile. """Recreate s6 service registrations for every persistent profile.
@ -82,6 +83,8 @@ def reconcile_profile_gateways(
directories are created at ``<scandir>/gateway-<profile>/``. directories are created at ``<scandir>/gateway-<profile>/``.
dry_run: When True, walk and return the action list without dry_run: When True, walk and return the action list without
touching the filesystem. For tests and `--dry-run` debug. touching the filesystem. For tests and `--dry-run` debug.
container_argv: Optional container PID 1 argv override. Production
reads ``/proc/1/cmdline``; tests inject it directly.
Returns: Returns:
One :class:`ReconcileAction` per profile, in this order: One :class:`ReconcileAction` per profile, in this order:
@ -93,8 +96,15 @@ def reconcile_profile_gateways(
# populated the root profile dir. The slot exists so # populated the root profile dir. The slot exists so
# ``hermes gateway start`` (no ``-p``) has somewhere to land; # ``hermes gateway start`` (no ``-p``) has somewhere to land;
# auto-up only when the prior state was "running" (same rule as # auto-up only when the prior state was "running" (same rule as
# named profiles). # named profiles). If the container was launched with the legacy
default_prior_state = _read_prior_state(hermes_home) # `gateway run` command and no state exists yet, seed that intent
# as `running` so the s6 reconciler preserves the pre-s6 behavior.
legacy_default_state = _maybe_migrate_legacy_gateway_run_state(
hermes_home,
container_argv=container_argv,
dry_run=dry_run,
)
default_prior_state = legacy_default_state or _read_prior_state(hermes_home)
default_should_start = default_prior_state in _AUTOSTART_STATES default_should_start = default_prior_state in _AUTOSTART_STATES
if not dry_run: if not dry_run:
_cleanup_stale_runtime_files(hermes_home) _cleanup_stale_runtime_files(hermes_home)
@ -147,6 +157,66 @@ def reconcile_profile_gateways(
return actions return actions
def _maybe_migrate_legacy_gateway_run_state(
hermes_home: Path,
*,
container_argv: Sequence[str] | None,
dry_run: bool,
) -> str | None:
"""Seed root gateway_state for pre-s6 `gateway run` containers.
The tini image let Docker users run the gateway as the container
command (`docker run ... gateway run`). After the s6 migration,
profile gateways are restored from persisted gateway_state.json; a
legacy container with no state file would therefore register the
default service down and never start. Only synthesize state when no
root gateway_state.json exists so explicit stopped/failed states keep
winning across restarts.
"""
state_file = hermes_home / "gateway_state.json"
if state_file.exists():
return None
if os.environ.get("HERMES_GATEWAY_NO_SUPERVISE", "").lower() in ("1", "true", "yes"):
return None
argv = tuple(container_argv) if container_argv is not None else _read_container_argv()
if not _is_legacy_gateway_run_request(argv):
return None
if not dry_run:
import time
state_file.write_text(json.dumps({
"gateway_state": "running",
"timestamp": int(time.time()),
"migrated_from": "legacy-container-cmd",
}) + "\n")
return "running"
def _read_container_argv() -> tuple[str, ...]:
    """Return PID 1's argv as read from ``/proc/1/cmdline``, best effort.

    The file content is split on NUL bytes and empty pieces are dropped.
    Each piece is decoded as UTF-8 with undecodable bytes replaced rather
    than raising. If the file cannot be read (``OSError``), an empty
    tuple is returned.
    """
    cmdline_path = Path("/proc/1/cmdline")
    try:
        payload = cmdline_path.read_bytes()
    except OSError:
        return ()

    decoded_args = []
    for chunk in payload.split(b"\0"):
        if chunk:
            decoded_args.append(chunk.decode("utf-8", "replace"))
    return tuple(decoded_args)
def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool:
    """Tell whether *argv* amounts to a plain `gateway run` container command.

    Up to three leading launcher elements are peeled off, in this fixed
    order and at most once each: an executable named ``init``, a path
    ending in ``main-wrapper.sh``, and an executable named ``hermes``.
    What remains must begin with ``gateway run``. A ``--no-supervise``
    flag anywhere in the remainder disqualifies the request.
    """
    remaining = list(argv)

    # Ordered prefix matchers; order matters because each one is tried
    # exactly once against whatever is currently at the head.
    prefix_matchers = (
        lambda head: Path(head).name == "init",
        lambda head: head.endswith("main-wrapper.sh"),
        lambda head: Path(head).name == "hermes",
    )
    for matches in prefix_matchers:
        if remaining and matches(remaining[0]):
            del remaining[0]

    if "--no-supervise" in remaining:
        return False
    return remaining[:2] == ["gateway", "run"]
def _read_prior_state(profile_dir: Path) -> str | None: def _read_prior_state(profile_dir: Path) -> str | None:
"""Read gateway_state.json's ``gateway_state`` field, or None if """Read gateway_state.json's ``gateway_state`` field, or None if
missing or unparseable. Unparseable counts as "no prior state" so missing or unparseable. Unparseable counts as "no prior state" so

View File

@ -484,6 +484,88 @@ def test_default_slot_autostarts_when_root_state_running(tmp_path: Path) -> None
assert not (scandir / "gateway-default" / "down").exists() assert not (scandir / "gateway-default" / "down").exists()
@pytest.mark.parametrize(
    "container_argv",
    [
        ("gateway", "run"),
        ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"),
    ],
)
def test_legacy_gateway_run_cmd_seeds_default_running_state(
    tmp_path: Path,
    container_argv: tuple[str, ...],
) -> None:
    """A legacy `gateway run` container command with no persisted
    gateway_state.json must be migrated into a running default slot,
    and the synthesized state must be written to disk."""
    scandir = tmp_path / "run-service"
    scandir.mkdir()

    actions = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=scandir,
        dry_run=False,
        container_argv=container_argv,
    )

    # The default slot is reported as started from a "running" prior state.
    default_action = next(a for a in actions if a.profile == "default")
    assert default_action.prior_state == "running"
    assert default_action.action == "started"

    # No "down" marker may be present in the default service directory.
    down_marker = scandir / "gateway-default" / "down"
    assert not down_marker.exists()

    # The migration leaves a state file recording where it came from.
    state_file = tmp_path / "gateway_state.json"
    state = json.loads(state_file.read_text())
    assert state["gateway_state"] == "running"
    assert state["migrated_from"] == "legacy-container-cmd"
@pytest.mark.parametrize(
    "container_argv",
    [
        ("gateway", "run", "--no-supervise"),
        ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run", "--no-supervise"),
    ],
)
def test_legacy_gateway_run_no_supervise_does_not_seed_s6_state(
    tmp_path: Path,
    container_argv: tuple[str, ...],
) -> None:
    """Passing `--no-supervise` to `gateway run` explicitly opts out of
    the s6 migration: nothing is seeded and the slot stays down."""
    scandir = tmp_path / "run-service"
    scandir.mkdir()

    actions = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=scandir,
        dry_run=False,
        container_argv=container_argv,
    )

    # The default slot is only registered, with no prior state.
    default_action = next(a for a in actions if a.profile == "default")
    assert default_action.prior_state is None
    assert default_action.action == "registered"

    # The "down" marker is present and no state file was written.
    down_marker = scandir / "gateway-default" / "down"
    assert down_marker.exists()
    state_file = tmp_path / "gateway_state.json"
    assert not state_file.exists()
def test_legacy_gateway_run_env_no_supervise_does_not_seed_s6_state(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Setting HERMES_GATEWAY_NO_SUPERVISE opts out of the migration the
    same way the CLI `--no-supervise` flag does."""
    scandir = tmp_path / "run-service"
    scandir.mkdir()
    monkeypatch.setenv("HERMES_GATEWAY_NO_SUPERVISE", "1")

    actions = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=scandir,
        dry_run=False,
        container_argv=("gateway", "run"),
    )

    # The default slot is only registered, with no prior state.
    default_action = next(a for a in actions if a.profile == "default")
    assert default_action.prior_state is None
    assert default_action.action == "registered"

    # The "down" marker is present and no state file was written.
    down_marker = scandir / "gateway-default" / "down"
    assert down_marker.exists()
    state_file = tmp_path / "gateway_state.json"
    assert not state_file.exists()
def test_default_slot_does_not_autostart_when_root_state_stopped( def test_default_slot_does_not_autostart_when_root_state_stopped(
tmp_path: Path, tmp_path: Path,
) -> None: ) -> None:
@ -491,12 +573,17 @@ def test_default_slot_does_not_autostart_when_root_state_stopped(
_seed_default_root(tmp_path, state="stopped") _seed_default_root(tmp_path, state="stopped")
actions = reconcile_profile_gateways( actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False, hermes_home=tmp_path,
scandir=scandir,
dry_run=False,
container_argv=("gateway", "run"),
) )
default_action = next(a for a in actions if a.profile == "default") default_action = next(a for a in actions if a.profile == "default")
assert default_action.action == "registered" assert default_action.action == "registered"
assert (scandir / "gateway-default" / "down").exists() assert (scandir / "gateway-default" / "down").exists()
state = json.loads((tmp_path / "gateway_state.json").read_text())
assert state["gateway_state"] == "stopped"
def test_default_slot_does_not_autostart_when_root_state_startup_failed( def test_default_slot_does_not_autostart_when_root_state_startup_failed(