fix(docker): seed s6 gateway state for legacy run cmd (#34829)
* fix(docker): seed s6 gateway state for legacy run cmd
* fix(docker): honor no-supervise during legacy gateway migration

---------

Co-authored-by: Donovan Yohan <donovan-yohan@users.noreply.github.com>
This commit is contained in:
@ -24,7 +24,7 @@ import logging
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
from typing import Literal, Sequence
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@ -57,6 +57,7 @@ def reconcile_profile_gateways(
|
||||
hermes_home: Path,
|
||||
scandir: Path,
|
||||
dry_run: bool = False,
|
||||
container_argv: Sequence[str] | None = None,
|
||||
) -> list[ReconcileAction]:
|
||||
"""Recreate s6 service registrations for every persistent profile.
|
||||
|
||||
@ -82,6 +83,8 @@ def reconcile_profile_gateways(
|
||||
directories are created at ``<scandir>/gateway-<profile>/``.
|
||||
dry_run: When True, walk and return the action list without
|
||||
touching the filesystem. For tests and `--dry-run` debug.
|
||||
container_argv: Optional container PID 1 argv override. Production
|
||||
reads ``/proc/1/cmdline``; tests inject it directly.
|
||||
|
||||
Returns:
|
||||
One :class:`ReconcileAction` per profile, in this order:
|
||||
@ -93,8 +96,15 @@ def reconcile_profile_gateways(
|
||||
# populated the root profile dir. The slot exists so
|
||||
# ``hermes gateway start`` (no ``-p``) has somewhere to land;
|
||||
# auto-up only when the prior state was "running" (same rule as
|
||||
# named profiles).
|
||||
default_prior_state = _read_prior_state(hermes_home)
|
||||
# named profiles). If the container was launched with the legacy
|
||||
# `gateway run` command and no state exists yet, seed that intent
|
||||
# as `running` so the s6 reconciler preserves the pre-s6 behavior.
|
||||
legacy_default_state = _maybe_migrate_legacy_gateway_run_state(
|
||||
hermes_home,
|
||||
container_argv=container_argv,
|
||||
dry_run=dry_run,
|
||||
)
|
||||
default_prior_state = legacy_default_state or _read_prior_state(hermes_home)
|
||||
default_should_start = default_prior_state in _AUTOSTART_STATES
|
||||
if not dry_run:
|
||||
_cleanup_stale_runtime_files(hermes_home)
|
||||
@ -147,6 +157,66 @@ def reconcile_profile_gateways(
|
||||
return actions
|
||||
|
||||
|
||||
def _maybe_migrate_legacy_gateway_run_state(
|
||||
hermes_home: Path,
|
||||
*,
|
||||
container_argv: Sequence[str] | None,
|
||||
dry_run: bool,
|
||||
) -> str | None:
|
||||
"""Seed root gateway_state for pre-s6 `gateway run` containers.
|
||||
|
||||
The tini image let Docker users run the gateway as the container
|
||||
command (`docker run ... gateway run`). After the s6 migration,
|
||||
profile gateways are restored from persisted gateway_state.json; a
|
||||
legacy container with no state file would therefore register the
|
||||
default service down and never start. Only synthesize state when no
|
||||
root gateway_state.json exists so explicit stopped/failed states keep
|
||||
winning across restarts.
|
||||
"""
|
||||
state_file = hermes_home / "gateway_state.json"
|
||||
if state_file.exists():
|
||||
return None
|
||||
|
||||
if os.environ.get("HERMES_GATEWAY_NO_SUPERVISE", "").lower() in ("1", "true", "yes"):
|
||||
return None
|
||||
|
||||
argv = tuple(container_argv) if container_argv is not None else _read_container_argv()
|
||||
if not _is_legacy_gateway_run_request(argv):
|
||||
return None
|
||||
|
||||
if not dry_run:
|
||||
import time
|
||||
state_file.write_text(json.dumps({
|
||||
"gateway_state": "running",
|
||||
"timestamp": int(time.time()),
|
||||
"migrated_from": "legacy-container-cmd",
|
||||
}) + "\n")
|
||||
return "running"
|
||||
|
||||
|
||||
def _read_container_argv() -> tuple[str, ...]:
    """Best-effort read of the container PID 1 argv.

    Returns:
        The NUL-separated entries of ``/proc/1/cmdline`` decoded as UTF-8
        (undecodable bytes are replaced), with empty entries dropped, or
        an empty tuple when the file cannot be read.
    """
    try:
        raw = Path("/proc/1/cmdline").read_bytes()
    except OSError:
        # No readable procfs entry: behave as "no argv".
        return ()
    return tuple(part.decode("utf-8", "replace") for part in raw.split(b"\0") if part)
|
||||
|
||||
|
||||
def _is_legacy_gateway_run_request(argv: Sequence[str]) -> bool:
    """Return True for Docker commands equivalent to `gateway run`.

    Optional leading launcher entries are stripped in this order, at most
    once each: an executable whose basename is ``init``, an entry ending in
    ``main-wrapper.sh``, and an executable whose basename is ``hermes``.
    The remainder must start with ``gateway run``. A ``--no-supervise``
    flag anywhere in the remaining arguments makes the result False.

    Args:
        argv: The container command line to inspect.

    Returns:
        True when the command is a ``gateway run`` request without
        ``--no-supervise``; False otherwise (including empty input).
    """
    args = list(argv)
    if args and Path(args[0]).name == "init":
        args = args[1:]
    if args and args[0].endswith("main-wrapper.sh"):
        args = args[1:]
    if args and Path(args[0]).name == "hermes":
        args = args[1:]
    if "--no-supervise" in args:
        return False
    # The slice is shorter than two items when args is, so the length is checked implicitly.
    return args[:2] == ["gateway", "run"]
|
||||
|
||||
|
||||
def _read_prior_state(profile_dir: Path) -> str | None:
|
||||
"""Read gateway_state.json's ``gateway_state`` field, or None if
|
||||
missing or unparseable. Unparseable counts as "no prior state" so
|
||||
|
||||
@ -484,6 +484,88 @@ def test_default_slot_autostarts_when_root_state_running(tmp_path: Path) -> None
|
||||
assert not (scandir / "gateway-default" / "down").exists()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "container_argv",
    [
        ("gateway", "run"),
        ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run"),
    ],
)
def test_legacy_gateway_run_cmd_seeds_default_running_state(
    tmp_path: Path,
    container_argv: tuple[str, ...],
) -> None:
    """Pre-s6 Docker users often ran `gateway run` as the container
    command. With no persisted gateway_state.json yet, s6 reconciliation
    must migrate that legacy intent into a running default gateway slot."""
    scandir = tmp_path / "run-service"
    scandir.mkdir()

    actions = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=scandir,
        dry_run=False,
        container_argv=container_argv,
    )

    default_action = next(a for a in actions if a.profile == "default")
    assert default_action.prior_state == "running"
    assert default_action.action == "started"
    # No "down" marker is expected for a slot that should auto-start.
    assert not (scandir / "gateway-default" / "down").exists()
    # The migration must also persist the synthesized state.
    state = json.loads((tmp_path / "gateway_state.json").read_text())
    assert state["gateway_state"] == "running"
    assert state["migrated_from"] == "legacy-container-cmd"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "container_argv",
    [
        ("gateway", "run", "--no-supervise"),
        ("/init", "/opt/hermes/docker/main-wrapper.sh", "gateway", "run", "--no-supervise"),
    ],
)
def test_legacy_gateway_run_no_supervise_does_not_seed_s6_state(
    tmp_path: Path,
    container_argv: tuple[str, ...],
) -> None:
    """`gateway run --no-supervise` is an explicit opt-out from s6 migration."""
    scandir = tmp_path / "run-service"
    scandir.mkdir()

    actions = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=scandir,
        dry_run=False,
        container_argv=container_argv,
    )

    default_action = next(a for a in actions if a.profile == "default")
    assert default_action.prior_state is None
    assert default_action.action == "registered"
    # The slot is registered with a "down" marker and no state file is seeded.
    assert (scandir / "gateway-default" / "down").exists()
    assert not (tmp_path / "gateway_state.json").exists()
|
||||
|
||||
|
||||
def test_legacy_gateway_run_env_no_supervise_does_not_seed_s6_state(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Env opt-out matches the CLI `--no-supervise` flag."""
    scandir = tmp_path / "run-service"
    scandir.mkdir()
    monkeypatch.setenv("HERMES_GATEWAY_NO_SUPERVISE", "1")

    # The argv alone would qualify for migration; the env var must veto it.
    actions = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=scandir,
        dry_run=False,
        container_argv=("gateway", "run"),
    )

    default_action = next(a for a in actions if a.profile == "default")
    assert default_action.prior_state is None
    assert default_action.action == "registered"
    assert (scandir / "gateway-default" / "down").exists()
    assert not (tmp_path / "gateway_state.json").exists()
|
||||
|
||||
|
||||
def test_default_slot_does_not_autostart_when_root_state_stopped(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
@ -491,12 +573,17 @@ def test_default_slot_does_not_autostart_when_root_state_stopped(
|
||||
_seed_default_root(tmp_path, state="stopped")
|
||||
|
||||
actions = reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
hermes_home=tmp_path,
|
||||
scandir=scandir,
|
||||
dry_run=False,
|
||||
container_argv=("gateway", "run"),
|
||||
)
|
||||
|
||||
default_action = next(a for a in actions if a.profile == "default")
|
||||
assert default_action.action == "registered"
|
||||
assert (scandir / "gateway-default" / "down").exists()
|
||||
state = json.loads((tmp_path / "gateway_state.json").read_text())
|
||||
assert state["gateway_state"] == "stopped"
|
||||
|
||||
|
||||
def test_default_slot_does_not_autostart_when_root_state_startup_failed(
|
||||
|
||||
Reference in New Issue
Block a user