From bdceedf784292040b38b8e09a5ee5bbdff7e2258 Mon Sep 17 00:00:00 2001 From: Ben Barclay Date: Mon, 1 Jun 2026 14:38:08 +1000 Subject: [PATCH] fix(docker): chown hermes-owned top-level state files on boot (#35098) (#36236) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The targeted data-volume chown in stage2-hook.sh only covers hermes-owned *subdirectories*; loose state files living directly under $HERMES_HOME (auth.json, state.db, gateway.lock, gateway_state.json, …) are missed. When created or rewritten by `docker exec hermes …` (root unless `-u` is passed) they land root-owned, and the unprivileged hermes runtime then hits PermissionError on next startup, producing a gateway restart loop. Fix: reset ownership of an explicit allowlist of hermes-owned top-level files on every boot. The list mirrors the top-level file entries of hermes_cli.profile_distribution.USER_OWNED_EXCLUDE plus the runtime lock files. This uses a targeted allowlist rather than the originally-proposed blanket `find $HERMES_HOME -maxdepth 1 -user root` sweep, preserving the targeted-ownership contract from #19788 / PR #19795: a bind-mounted $HERMES_HOME may contain host-owned files Hermes does not manage, and those must never be chowned. Verified end-to-end: allowlisted root-owned files are reset to hermes on restart while a non-allowlisted host file keeps its root ownership. Co-authored-by: x1am1 <2663402852@qq.com> --- docker/stage2-hook.sh | 27 ++++ scripts/release.py | 1 + .../tools/test_stage2_hook_toplevel_chown.py | 138 ++++++++++++++++++ 3 files changed, 166 insertions(+) create mode 100644 tests/tools/test_stage2_hook_toplevel_chown.py diff --git a/docker/stage2-hook.sh b/docker/stage2-hook.sh index 18668e7fc..d1dbe136f 100755 --- a/docker/stage2-hook.sh +++ b/docker/stage2-hook.sh @@ -187,6 +187,33 @@ if [ -d "$HERMES_HOME/profiles" ]; then chown -R hermes:hermes "$HERMES_HOME/profiles" 2>/dev/null || true fi +# Reset ownership of hermes-owned top-level state files on every boot. +# The targeted data-volume chown above only covers hermes-owned +# *subdirectories*; loose state files living directly under $HERMES_HOME +# are missed. When those files are created or rewritten by +# `docker exec hermes …` (root unless `-u` is passed) they +# land root-owned, and the unprivileged hermes runtime then hits +# PermissionError on next startup (e.g. gateway.lock / state.db / +# auth.json), producing a gateway restart loop. +# +# We use an explicit allowlist rather than a blanket `find -user root` +# sweep so host-owned files in a bind-mounted $HERMES_HOME are never +# touched — same targeted-ownership contract as the subdir chown above +# (issue #19788, PR #19795). The list mirrors the top-level *file* +# entries of hermes_cli.profile_distribution.USER_OWNED_EXCLUDE plus the +# runtime lock files; keep them in sync if that set changes. +for f in \ + auth.json auth.lock .env \ + state.db state.db-shm state.db-wal \ + hermes_state.db \ + response_store.db response_store.db-shm response_store.db-wal \ + gateway.pid gateway.lock gateway_state.json processes.json \ + active_profile; do + if [ -e "$HERMES_HOME/$f" ]; then + chown hermes:hermes "$HERMES_HOME/$f" 2>/dev/null || true + fi +done + # --- config.yaml permissions --- # Ensure config.yaml is readable by the hermes runtime user even if it # was edited on the host after initial ownership setup. diff --git a/scripts/release.py b/scripts/release.py index e82da3d51..f3f8ade6d 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -1412,6 +1412,7 @@ AUTHOR_MAP = { "116212274+amathxbt@users.noreply.github.com": "amathxbt", # PR #22155 (cache tool_output_limits) "takis312@hotmail.com": "ErnestHysa", # PRs #32636/#32708 (MCP asyncio.sleep + O(n^2) watcher drain) "me@simontaggart.com": "SiTaggart", # PR #35583 (docker_forward_env empty-secret .env fallback) + "2663402852@qq.com": "x1am1", # PR #35098 (chown root-owned top-level HERMES_HOME state files) } diff --git a/tests/tools/test_stage2_hook_toplevel_chown.py b/tests/tools/test_stage2_hook_toplevel_chown.py new file mode 100644 index 000000000..1ad2eea12 --- /dev/null +++ b/tests/tools/test_stage2_hook_toplevel_chown.py @@ -0,0 +1,138 @@ +"""Contract test: the s6-overlay stage2 hook resets ownership of hermes-owned +top-level state files in $HERMES_HOME — but only those, never arbitrary +host-owned files. + +Regression guard for the gateway restart loop reported in #35098: files such +as gateway.lock / state.db / auth.json live directly under $HERMES_HOME (not in +a subdir), so the targeted subdir chown misses them. When created or rewritten +by `docker exec hermes …` (root unless `-u` is passed) they land +root-owned and the unprivileged hermes runtime then hits PermissionError on next +startup. + +The fix uses an explicit allowlist rather than a blanket `find -user root` +sweep, preserving the targeted-ownership contract from #19788 / PR #19795: a +bind-mounted $HERMES_HOME may contain host-owned files Hermes does not manage, +and those must never be chowned. + +The s6-overlay rework moved bootstrap from docker/entrypoint.sh (now a shim) to +docker/stage2-hook.sh, installed as /etc/cont-init.d/01-hermes-setup. This test +targets that location. +""" +from __future__ import annotations + +import os +import re +import shutil +import subprocess +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[2] +STAGE2_HOOK = REPO_ROOT / "docker" / "stage2-hook.sh" + + +@pytest.fixture(scope="module") +def stage2_text() -> str: + if not STAGE2_HOOK.exists(): + pytest.skip("docker/stage2-hook.sh not present in this checkout") + return STAGE2_HOOK.read_text() + + +def _toplevel_chown_loop(text: str) -> str: + """Extract the `for f in … chown hermes:hermes "$HERMES_HOME/$f" … done` + block that repairs top-level state-file ownership.""" + m = re.search( + r"(for f in \\\n(?:.*\\\n)*?.*; do\n(?:.*\n)*?done)", + text, + ) + assert m, "stage2-hook.sh must contain the top-level-file chown for-loop (#35098)" + block = m.group(1) + assert 'chown hermes:hermes "$HERMES_HOME/$f"' in block, ( + "the top-level-file loop must chown each allowlisted file to hermes" + ) + return block + + +def test_toplevel_chown_loop_present(stage2_text: str) -> None: + block = _toplevel_chown_loop(stage2_text) + # The reported-broken files must be covered. + for required in ("auth.json", "state.db", "gateway.lock", "gateway_state.json"): + assert required in block, ( + f"top-level chown allowlist must include {required!r} (#35098)" + ) + + +def test_no_blanket_find_user_root_sweep(stage2_text: str) -> None: + """The fix must NOT reintroduce a blanket `find … -user root` chown of + $HERMES_HOME contents — that would clobber host-owned files in a bind mount + (#19788 / PR #19795).""" + assert not re.search(r"find\s+\"?\$\{?HERMES_HOME\}?\"?[^\n]*-user\s+root", stage2_text), ( + "stage2-hook.sh must not blanket-chown root-owned files under " + "$HERMES_HOME via `find -user root`; use the targeted allowlist instead " + "so host-owned bind-mounted files are preserved (#19788, #19795)." + ) + + +def _run_loop(text: str, present_files: list[str]) -> list[str]: + """Run the extracted chown loop in a sandbox $HERMES_HOME, with `chown` + stubbed to record which paths it was asked to touch. Returns the basenames + the loop attempted to chown.""" + bash = shutil.which("bash") + if bash is None: + pytest.skip("bash not available") + block = _toplevel_chown_loop(text) + + import tempfile + + with tempfile.TemporaryDirectory() as d: + dpath = Path(d) + home = dpath / "home" + home.mkdir() + for f in present_files: + (home / f).touch() + # A non-allowlisted, "host-owned" file that must never be chowned. + (home / "host_secret.json").touch() + + # Stub chown to record the basename of its last argument (the path), + # so we observe exactly which files the allowlist loop selected + # without needing real root privileges. + script = ( + "set -e\n" + f'HERMES_HOME="{home}"\n' + f'chown() {{ for a in "$@"; do :; done; echo "${{a##*/}}" >> "{dpath}/chown.log"; }}\n' + + block + ) + script_path = dpath / "harness.sh" + script_path.write_text(script) + + proc = subprocess.run([bash, str(script_path)], capture_output=True, text=True) + assert proc.returncode == 0, proc.stderr + + log = dpath / "chown.log" + if not log.exists(): + return [] + return [ln for ln in log.read_text().splitlines() if ln] + + +def test_loop_chowns_present_allowlisted_files(stage2_text: str) -> None: + touched = _run_loop(stage2_text, ["auth.json", "state.db", "gateway.lock"]) + assert "auth.json" in touched + assert "state.db" in touched + assert "gateway.lock" in touched + + +def test_loop_skips_nonallowlisted_host_file(stage2_text: str) -> None: + """A file NOT on the allowlist (e.g. a host-owned file in a bind mount) must + never be chowned, even if present.""" + touched = _run_loop(stage2_text, ["auth.json"]) + assert "host_secret.json" not in touched, ( + "the allowlist loop must not touch non-allowlisted files (#19788)" + ) + + +def test_loop_skips_absent_files(stage2_text: str) -> None: + """Allowlisted files that don't exist are skipped (no spurious chown).""" + touched = _run_loop(stage2_text, ["auth.json"]) + # state.db wasn't created, so it must not appear. + assert "state.db" not in touched