fix(docker): dashboard slot stays 'down' when HERMES_DASHBOARD unset
PR #30136 review caught a false positive: when HERMES_DASHBOARD was unset, the dashboard run script did `exec sleep infinity`, so `s6-svstat /run/service/dashboard` reported the slot as 'up'. `hermes doctor` and any other s6-svstat-based health check saw the dashboard as supervised-running even though no dashboard process existed. Add cont-init.d/03-dashboard-toggle: writes a `down` marker file into `/run/service/dashboard/` when HERMES_DASHBOARD is falsy, removes any leftover marker when it's truthy. s6-supervise honors `down` by not starting the service, so s6-svstat reports 'down' — matching reality. The run script's HERMES_DASHBOARD case-statement stays in place as a belt-and-suspenders guard, so the two layers can never disagree. Two new integration tests lock the behavior: slot reports down when unset; slot reports up when set to 1.
This commit is contained in:
@ -183,6 +183,7 @@ RUN mkdir -p /etc/cont-init.d && \
|
|||||||
> /etc/cont-init.d/01-hermes-setup && \
|
> /etc/cont-init.d/01-hermes-setup && \
|
||||||
chmod +x /etc/cont-init.d/01-hermes-setup
|
chmod +x /etc/cont-init.d/01-hermes-setup
|
||||||
COPY --chmod=0755 docker/cont-init.d/02-reconcile-profiles /etc/cont-init.d/02-reconcile-profiles
|
COPY --chmod=0755 docker/cont-init.d/02-reconcile-profiles /etc/cont-init.d/02-reconcile-profiles
|
||||||
|
COPY --chmod=0755 docker/cont-init.d/03-dashboard-toggle /etc/cont-init.d/03-dashboard-toggle
|
||||||
|
|
||||||
# ---------- Runtime ----------
|
# ---------- Runtime ----------
|
||||||
ENV HERMES_WEB_DIST=/opt/hermes/hermes_cli/web_dist
|
ENV HERMES_WEB_DIST=/opt/hermes/hermes_cli/web_dist
|
||||||
|
|||||||
55
docker/cont-init.d/03-dashboard-toggle
Executable file
55
docker/cont-init.d/03-dashboard-toggle
Executable file
@ -0,0 +1,55 @@
|
|||||||
|
#!/command/with-contenv sh
|
||||||
|
# shellcheck shell=sh
|
||||||
|
# Toggle the dashboard s6-rc service slot based on HERMES_DASHBOARD.
|
||||||
|
#
|
||||||
|
# Runs as root in cont-init.d, after 01-hermes-setup (stage2) and
|
||||||
|
# 02-reconcile-profiles, BEFORE s6-rc starts user services.
|
||||||
|
#
|
||||||
|
# Background (PR #30136 review item I3): the dashboard service was
|
||||||
|
# always declared as an s6-rc longrun, with its run script checking
|
||||||
|
# HERMES_DASHBOARD and `exec sleep infinity` when unset. Trouble:
|
||||||
|
# s6-svstat then reports the dashboard slot as "up" (because sleep
|
||||||
|
# IS running) even though no dashboard process exists. `hermes
|
||||||
|
# doctor` and any other s6-svstat-based health check sees a
|
||||||
|
# false-positive up-state.
|
||||||
|
#
|
||||||
|
# Fix: write a `down` marker file into the live service-dir when
|
||||||
|
# HERMES_DASHBOARD is unset / falsy. s6-supervise honors `down` by
|
||||||
|
# not starting the service at all, so s6-svstat reports `down` —
|
||||||
|
# matching reality.
|
||||||
|
#
|
||||||
|
# The run script's HERMES_DASHBOARD case-statement stays in place
|
||||||
|
# as a belt-and-suspenders guard: even if the down marker is
|
||||||
|
# removed at runtime and the service is brought up, the run script
|
||||||
|
# still bails when HERMES_DASHBOARD is unset. Both layers agree.
|
||||||
|
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
# Live service directory for the dashboard longrun. s6-overlay
|
||||||
|
# compiles /etc/s6-overlay/s6-rc.d/dashboard/ into this location
|
||||||
|
# at boot, before cont-init.d scripts run.
|
||||||
|
DASHBOARD_LIVE_DIR="/run/service/dashboard"
|
||||||
|
|
||||||
|
# If the live directory hasn't materialized yet (e.g. running in a
|
||||||
|
# stripped-down test image), nothing to do — the run script's env
|
||||||
|
# check still keeps things safe.
|
||||||
|
if [ ! -d "$DASHBOARD_LIVE_DIR" ]; then
|
||||||
|
echo "[dashboard-toggle] $DASHBOARD_LIVE_DIR not present; skipping"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
case "${HERMES_DASHBOARD:-}" in
|
||||||
|
1|true|TRUE|True|yes|YES|Yes)
|
||||||
|
# Enabled — remove any leftover down marker from a previous boot.
|
||||||
|
if [ -e "$DASHBOARD_LIVE_DIR/down" ]; then
|
||||||
|
rm -f "$DASHBOARD_LIVE_DIR/down"
|
||||||
|
echo "[dashboard-toggle] HERMES_DASHBOARD enabled; removed down marker"
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
# Disabled — write a down marker so s6-supervise won't start
|
||||||
|
# the service. s6-svstat will report it as down, matching reality.
|
||||||
|
touch "$DASHBOARD_LIVE_DIR/down"
|
||||||
|
echo "[dashboard-toggle] HERMES_DASHBOARD unset; marked dashboard slot down"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
@ -52,6 +52,60 @@ def test_dashboard_not_running_by_default(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_dashboard_slot_reports_down_when_disabled(
|
||||||
|
built_image: str, container_name: str,
|
||||||
|
) -> None:
|
||||||
|
"""Without HERMES_DASHBOARD, s6-svstat should report the dashboard
|
||||||
|
slot as DOWN (not up-with-sleep-infinity, which would
|
||||||
|
false-positive `hermes doctor` and any other health check).
|
||||||
|
|
||||||
|
Locks the PR #30136 review item I3 fix: cont-init.d/03-dashboard-toggle
|
||||||
|
writes a `down` marker file in the live service-dir when
|
||||||
|
HERMES_DASHBOARD is unset, so the slot reflects reality.
|
||||||
|
"""
|
||||||
|
subprocess.run(
|
||||||
|
["docker", "run", "-d", "--name", container_name, built_image,
|
||||||
|
"sleep", "60"],
|
||||||
|
check=True, capture_output=True, timeout=30,
|
||||||
|
)
|
||||||
|
time.sleep(5)
|
||||||
|
# /command/ isn't on PATH for docker-exec sessions, so call by
|
||||||
|
# absolute path.
|
||||||
|
r = docker_exec(
|
||||||
|
container_name, "/command/s6-svstat", "/run/service/dashboard",
|
||||||
|
)
|
||||||
|
assert r.returncode == 0, f"s6-svstat failed: {r.stderr!r} / {r.stdout!r}"
|
||||||
|
assert "down" in r.stdout, (
|
||||||
|
f"Dashboard slot should be 'down' without HERMES_DASHBOARD; "
|
||||||
|
f"svstat reports: {r.stdout!r}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_dashboard_slot_reports_up_when_enabled(
|
||||||
|
built_image: str, container_name: str,
|
||||||
|
) -> None:
|
||||||
|
"""Symmetry: with HERMES_DASHBOARD=1, s6-svstat reports the slot as up."""
|
||||||
|
subprocess.run(
|
||||||
|
["docker", "run", "-d", "--name", container_name,
|
||||||
|
"-e", "HERMES_DASHBOARD=1", built_image, "sleep", "120"],
|
||||||
|
check=True, capture_output=True, timeout=30,
|
||||||
|
)
|
||||||
|
# uvicorn takes a moment to bind; poll svstat.
|
||||||
|
deadline = time.monotonic() + 30.0
|
||||||
|
last = ""
|
||||||
|
while time.monotonic() < deadline:
|
||||||
|
r = docker_exec(
|
||||||
|
container_name, "/command/s6-svstat", "/run/service/dashboard",
|
||||||
|
)
|
||||||
|
last = r.stdout
|
||||||
|
if r.returncode == 0 and "up " in r.stdout:
|
||||||
|
return # success
|
||||||
|
time.sleep(0.5)
|
||||||
|
raise AssertionError(
|
||||||
|
f"Dashboard slot never reached up state; last svstat: {last!r}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_dashboard_opt_in_starts(
|
def test_dashboard_opt_in_starts(
|
||||||
built_image: str, container_name: str,
|
built_image: str, container_name: str,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user