From 90b3c54de97267015aa3b65d7330edd3f3c01f91 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 12:16:57 -0700 Subject: [PATCH] fix: drain thread no longer crashes on fd-less stdout streams (#34789) * docs(code-execution): document HERMES_* env narrowing + passthrough workaround The execute_code sandbox-child env scrub (108397726, #27303) deliberately dropped the broad HERMES_ prefix passthrough, keeping only an operational 4-var allowlist (HERMES_HOME/PROFILE/CONFIG/ENV). A script that relied on a non-secret HERMES_* var (HERMES_BASE_URL, HERMES_KANBAN_DB, HERMES_*_WEBHOOK, or a plugin-defined one) now sees it unset in the child. Document the behavior change and the two recovery routes (terminal.env_passthrough in config.yaml, or required_environment_variables in skill frontmatter), plus the debug log line that surfaces the drop for diagnosis. * fix: drain thread no longer crashes on fd-less stdout streams The _wait_for_process drain thread called proc.stdout.fileno() unconditionally. ProcessHandle implementations whose stdout is not backed by a real OS fd (iterator-style in-memory streams, mock procs) raised 'list_iterator' object has no attribute 'fileno' (or 'fileno() returned a non-integer' from select.select), killing the daemon thread and silently losing all process output. Resolve the fd defensively at the top of _drain; when stdout has no usable integer fileno, fall back to draining it as an iterable (the legacy 'for line in proc.stdout' contract). The real subprocess / os.pipe-backed select() fast path is unchanged. --- tools/environments/base.py | 44 ++++++++++++++- .../user-guide/features/code-execution.md | 56 +++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/tools/environments/base.py b/tools/environments/base.py index 2666990bf..618ea2bb9 100644 --- a/tools/environments/base.py +++ b/tools/environments/base.py @@ -524,8 +524,50 @@ class BaseEnvironment(ABC): # U+FFFD substitution rather than clobbering the whole buffer. decoder = codecs.getincrementaldecoder("utf-8")(errors="replace") + def _drain_iterable(stream): + # Fallback path: ``stream`` is not backed by a real OS file + # descriptor (no usable ``fileno()``). This covers in-memory + # ProcessHandle adapters that expose stdout as a plain iterator of + # already-collected output (the legacy ``for line in proc.stdout`` + # contract) rather than a live pipe. Iterate it to EOF. Without + # this, the drain thread would raise an unhandled exception and die + # silently, losing all of the process's output. + try: + for piece in stream: + if piece is None: + continue + if isinstance(piece, bytes): + output_chunks.append(decoder.decode(piece)) + else: + output_chunks.append(str(piece)) + except Exception: + pass + finally: + try: + tail = decoder.decode(b"", final=True) + if tail: + output_chunks.append(tail) + except Exception: + pass + def _drain(): - fd = proc.stdout.fileno() + # Resolve a real OS file descriptor up front. Real subprocesses and + # the SDK ``_ThreadedProcessHandle`` (os.pipe-backed) both return an + # integer fd here. Mocks / iterator-style stdout streams either lack + # ``fileno()`` entirely or return a non-integer — in that case fall + # back to draining the stream as an iterable instead of crashing the + # thread (issue: 'list_iterator' object has no attribute 'fileno'). + stream = proc.stdout + if stream is None: + return + fileno = getattr(stream, "fileno", None) + try: + fd = fileno() if callable(fileno) else None + except Exception: + fd = None + if not isinstance(fd, int) or fd < 0: + _drain_iterable(stream) + return # select.select does NOT work on pipe fds on Windows (only sockets). # Use blocking os.read in a daemon thread instead — safe because # EOF arrives promptly when bash exits. diff --git a/website/docs/user-guide/features/code-execution.md b/website/docs/user-guide/features/code-execution.md index 804984cbf..f3beaa473 100644 --- a/website/docs/user-guide/features/code-execution.md +++ b/website/docs/user-guide/features/code-execution.md @@ -219,6 +219,62 @@ terminal: See the [Security guide](/user-guide/security#environment-variable-passthrough) for full details. +### `HERMES_*` variables in the child + +The child process receives only a small, fixed set of operational `HERMES_*` +variables by exact name: + +- `HERMES_HOME` +- `HERMES_PROFILE` +- `HERMES_CONFIG` +- `HERMES_ENV` + +(plus `HERMES_RPC_DIR` / `HERMES_RPC_SOCKET` / `TZ` / `HOME`, which Hermes +injects explicitly so the RPC channel works). + +:::note Behavior change +Earlier versions passed **any** variable whose name began with `HERMES_` +through to the child. That broad prefix was removed for security hardening: it +could leak `HERMES_*`-named configuration that doesn't match a secret substring +(for example `HERMES_BASE_URL`, `HERMES_KANBAN_DB`, or a `HERMES_*_WEBHOOK` +endpoint) into arbitrary sandboxed code. + +If an `execute_code` script — or a repo/plugin module it imports at import time +— relied on a `HERMES_*` variable outside the four operational names above, it +will now find that variable **unset** in the child. The drop is intentional, +not a bug. +::: + +**Workaround — opt the variable back in explicitly.** Both routes pass the +variable through `execute_code` *and* `terminal` children, and neither weakens +the secret-stripping guarantee (Hermes-managed provider credentials can never +be re-allowed this way): + +1. **Per-machine, in `config.yaml`** — add the exact variable name to the + passthrough allowlist: + + ```yaml + terminal: + env_passthrough: + - HERMES_KANBAN_DB + - HERMES_BASE_URL + ``` + +2. **Per-skill, in the skill's frontmatter** — declare it so it is registered + automatically whenever that skill is loaded: + + ```yaml + required_environment_variables: + - HERMES_KANBAN_DB + ``` + +**Diagnosing it.** When the child drops one or more non-allowlisted `HERMES_*` +variables, Hermes emits a one-line `debug` log naming them and pointing at the +`env_passthrough` escape hatch. Run with debug logging (`hermes logs --level +DEBUG`, or check `~/.hermes/logs/agent.log`) and look for +`execute_code: dropped N non-allowlisted HERMES_* var(s)` if a script behaves +as though a `HERMES_*` variable is missing. + Hermes always writes the script and the auto-generated `hermes_tools.py` RPC stub into a temp staging directory that is cleaned up after execution. In `strict` mode the script also *runs* there; in `project` mode it runs in the session's working directory (the staging directory stays on `PYTHONPATH` so imports still resolve). The child process runs in its own process group so it can be cleanly killed on timeout or interruption. ## execute_code vs terminal