From 343c54e35bfe8682dcf597aea1f0ea5278864156 Mon Sep 17 00:00:00 2001 From: Ben Barclay Date: Thu, 4 Jun 2026 10:51:51 +1000 Subject: [PATCH] fix(docker): reject unsupported --user start with clear guidance (#38579) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `docker run --user $(id -u):$(id -g)` was a tini-era trick to make container-written files match the host user. Under s6-overlay it no longer works: the bootstrap (UID remap, volume + build-tree chown, config seeding) needs root, and the baked image dirs (/opt/data, /opt/hermes/.venv, ui-tui, node_modules) are owned by the hermes build UID (10000). A pinned arbitrary UID can't write them, so the runtime fails with EACCES on a bind mount or hard-crashes on a named volume (Docker inits the volume from the image as 10000; the non-root start can't even `cd /opt/data`, and the profile reconciler dies with PermissionError on gateway_state.json). Detect that start early in both the cont-init hook (stage2-hook.sh) and the CMD wrapper (main-wrapper.sh) and fail fast with actionable guidance pointing at the supported path: root start + HERMES_UID/HERMES_GID (or the PUID/PGID aliases), which remaps the hermes user and chowns the volume — the same host-UID-matching outcome --user was used for, without breaking s6. The guard fires only when the current UID is neither root NOR the hermes UID. This preserves the supported non-root start from #34648/#34837 (running with `--user 10000:10000`, i.e. pinned to the hermes UID itself), which is unaffected — only the arbitrary-UID variant that #34837 never actually made writable is rejected. Verified live across five scenarios (built image, bind + named volume): arbitrary --user on bind -> rejected with guidance, hermes does not run; arbitrary --user on named volume -> guidance shown, no raw 'can't cd' crash; --user 10000:10000 -> boots; root + HERMES_UID=4242 remap -> boots, guard not tripped; default root start -> boots. Pre-fix control reproduces the raw PermissionError + 'can't cd' crash with no guidance. --- docker/main-wrapper.sh | 28 +++++ docker/stage2-hook.sh | 50 ++++++++ .../tools/test_stage2_hook_user_flag_guard.py | 119 ++++++++++++++++++ 3 files changed, 197 insertions(+) create mode 100644 tests/tools/test_stage2_hook_user_flag_guard.py diff --git a/docker/main-wrapper.sh b/docker/main-wrapper.sh index 7a0b04d62..20d8c709a 100755 --- a/docker/main-wrapper.sh +++ b/docker/main-wrapper.sh @@ -21,6 +21,34 @@ set -e drop() { [ "$(id -u)" = 0 ] && set -- s6-setuidgid hermes "$@"; exec "$@"; } +# --- Reject the unsupported `docker run --user :` start --- +# Mirror the guard in stage2-hook.sh (cont-init). This is the surface the +# user actually sees in `docker run` output: when the container is pinned to +# an arbitrary non-root, non-hermes UID, the bootstrap was skipped and the +# baked image dirs (owned by the hermes build UID) are unwritable, so fail +# fast here with actionable guidance rather than crashing on `cd`/EACCES +# further down. See stage2-hook.sh for the full rationale. +cur_uid="$(id -u)" +if [ "$cur_uid" != 0 ] && [ "$cur_uid" != "$(id -u hermes)" ]; then + cat >&2 <:` start --- +# Detect the case where the container was launched with `--user` pinned to an +# arbitrary host UID (the classic `--user $(id -u):$(id -g)` invocation people +# used in the tini era to make container-written files match their host user). +# +# Under s6-overlay this no longer works: the bootstrap (UID remap, volume + +# build-tree chown, config seeding) all require root, and they're skipped when +# the container starts non-root. The baked image trees (/opt/data, /opt/hermes/ +# .venv, ui-tui, node_modules) stay owned by the hermes build UID (10000), so an +# arbitrary `--user` UID can't write them — the runtime then fails with EACCES +# on a bind mount, or hard-crashes on a named volume (Docker initialises the +# volume from the image as UID 10000, and the non-root start can't even `cd` +# into $HERMES_HOME). See #34837 for the supervision-tree side of this. +# +# The supported way to match host-side ownership is to start as root (the image +# default) and pass HERMES_UID/HERMES_GID — or the PUID/PGID aliases — which the +# remap block below consumes via usermod/groupmod + targeted chown. That gives +# the exact same outcome (files owned by your host UID) without breaking s6. +# +# preinit runs setuid-root (euid=0) but cont-init.d hooks run with the real UID +# the container was started as, so `id -u` here is the host UID (e.g. 1000), and +# `id -u hermes` is the unremapped build UID (10000) because no root-only remap +# could run. root starts (id -u = 0) and the normal supervised drop to the +# hermes UID are both unaffected. +cur_uid="$(id -u)" +if [ "$cur_uid" != 0 ] && [ "$cur_uid" != "$(id -u hermes)" ]; then + cat >&2 <:` start with actionable +guidance, while still allowing: + + - root start (id -u == 0) + - `--user ` (the supported non-root start, #34648 / #34837) + +Background: in the tini era `docker run --user $(id -u):$(id -g)` was used to +make container-written files match the host user. Under s6-overlay this can't +work — the bootstrap (UID remap, volume/build-tree chown, config seeding) needs +root, and the baked image dirs are owned by the hermes build UID, so an +arbitrary pinned UID can't write them (EACCES on a bind mount, hard crash on a +named volume). The supported path is root start + HERMES_UID/HERMES_GID (or the +PUID/PGID aliases), which remaps the hermes user and chowns the volume. + +The guard fires only when the current UID is neither root NOR the hermes UID, +so the #34648 `--user 10000:10000` case (pinning to the hermes UID itself) is +unaffected. + +Extraction + stubbed-shell-run mirrors +tests/tools/test_stage2_hook_toplevel_chown.py. +""" +from __future__ import annotations + +import re +import shutil +import subprocess +import tempfile +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[2] +STAGE2_HOOK = REPO_ROOT / "docker" / "stage2-hook.sh" +MAIN_WRAPPER = REPO_ROOT / "docker" / "main-wrapper.sh" + + +def _read(p: Path) -> str: + if not p.exists(): + pytest.skip(f"{p} not present in this checkout") + return p.read_text() + + +def _guard_block(text: str) -> str: + """Extract the `cur_uid=...; if [ ... ]; then ... exit 1; fi` guard.""" + m = re.search( + r"(cur_uid=\"\$\(id -u\)\"\nif \[ \"\$cur_uid\" != 0 \](?:.*\n)*?fi)", + text, + ) + assert m, "expected the --user guard block (cur_uid + non-root/non-hermes check)" + return m.group(1) + + +@pytest.mark.parametrize("path", [STAGE2_HOOK, MAIN_WRAPPER]) +def test_guard_present_and_mentions_remediation(path: Path) -> None: + text = _read(path) + block = _guard_block(text) + # Must check non-root AND non-hermes-uid (so --user 10000:10000 is allowed). + assert '"$cur_uid" != 0' in block + assert '"$cur_uid" != "$(id -u hermes)"' in block + assert "exit 1" in block + # Must point users at the supported env vars. + assert "HERMES_UID" in block and "HERMES_GID" in block + assert "PUID" in block and "PGID" in block + + +def _run_guard(text: str, *, cur_uid: int, hermes_uid: int = 10000) -> subprocess.CompletedProcess: + """Run the extracted guard with `id` stubbed. Returns the completed process + (rc 1 + stderr message when rejected, rc 0 when allowed through).""" + bash = shutil.which("bash") + if bash is None: + pytest.skip("bash not available") + block = _guard_block(text) + with tempfile.TemporaryDirectory() as d: + script = ( + "set -e\n" + # Stub `id`: `id -u` -> cur_uid; `id -u hermes` -> hermes_uid. + f'id() {{ if [ "$2" = hermes ]; then echo {hermes_uid}; else echo {cur_uid}; fi; }}\n' + + block + + "\necho GUARD_PASSED\n" # only reached when the guard allows through + ) + sp = Path(d) / "h.sh" + sp.write_text(script) + return subprocess.run([bash, str(sp)], capture_output=True, text=True) + + +def test_arbitrary_user_uid_is_rejected() -> None: + """An arbitrary host UID (1000), neither root nor hermes, is rejected.""" + for text in (_read(STAGE2_HOOK), _read(MAIN_WRAPPER)): + proc = _run_guard(text, cur_uid=1000, hermes_uid=10000) + assert proc.returncode == 1, f"expected rejection, got rc={proc.returncode}" + assert "not supported" in proc.stderr + assert "GUARD_PASSED" not in proc.stdout + + +def test_root_start_passes() -> None: + """Root start (uid 0) is never blocked.""" + for text in (_read(STAGE2_HOOK), _read(MAIN_WRAPPER)): + proc = _run_guard(text, cur_uid=0, hermes_uid=10000) + assert proc.returncode == 0, proc.stderr + assert "GUARD_PASSED" in proc.stdout + + +def test_user_pinned_to_hermes_uid_passes() -> None: + """`--user 10000:10000` (the hermes UID itself) is the supported non-root + start from #34648 / #34837 and must NOT be blocked.""" + for text in (_read(STAGE2_HOOK), _read(MAIN_WRAPPER)): + proc = _run_guard(text, cur_uid=10000, hermes_uid=10000) + assert proc.returncode == 0, proc.stderr + assert "GUARD_PASSED" in proc.stdout + + +def test_user_pinned_to_remapped_hermes_uid_passes() -> None: + """After a HERMES_UID remap the hermes UID is e.g. 4242; a container pinned + to that same UID must still pass (cur_uid == hermes_uid).""" + for text in (_read(STAGE2_HOOK), _read(MAIN_WRAPPER)): + proc = _run_guard(text, cur_uid=4242, hermes_uid=4242) + assert proc.returncode == 0, proc.stderr + assert "GUARD_PASSED" in proc.stdout