fix(code-exec): restore approval context in execute_code RPC threads + guard entry

Wrap both execute_code RPC threads (local UDS + remote file-RPC) with propagate_context_to_thread so gateway sessions no longer fall into check_dangerous_command's non-interactive auto-approve branch and the CLI approval prompt stays reachable. Add check_execute_code_guard: one-shot fail-closed approval of the whole script in gateway/ask/cron-deny before the child spawns (skips isolated backends; command-string built only past the early returns). Drop the broad HERMES_ env passthrough for an explicit operational allowlist plus DSN/WEBHOOK secret substrings, and update the POSIX-equivalence oracle. Refs #4146, #27303, #30882, #33057
2026-05-28 17:47:09 -04:00
parent 21aeefe5fd
commit 1083977261
3 changed files with 354 additions and 114 deletions
--- a/tests/tools/test_code_execution_windows_env.py
+++ b/tests/tools/test_code_execution_windows_env.py
@ -253,20 +253,24 @@ class TestWindowsSocketSmokeTest:
 # ---------------------------------------------------------------------------

 def _legacy_posix_scrubber(source_env, is_passthrough):
-    """Verbatim copy of the pre-Windows-fix inline scrubbing logic.
+    """Independent oracle for TestPosixEquivalence — a from-scratch reimpl of
+    _scrub_child_env's POSIX behavior, used to prove the production helper does
+    what we think it does.

-    This is the oracle used by TestPosixEquivalence to prove the refactor
-    did not change POSIX behavior.  DO NOT edit this to "match" a future
-    production change — if _scrub_child_env's POSIX behavior legitimately
-    needs to evolve, delete this function and adjust the equivalence test
-    on purpose, so the churn is visible in review.
+    Deliberately updated for #27303 (the broad ``HERMES_`` prefix was dropped
+    in favor of an explicit operational allowlist, and DSN/WEBHOOK were added
+    to the secret substrings).  The original docstring said: if POSIX behavior
+    legitimately needs to evolve, adjust this oracle on purpose so the churn is
+    visible in review — that is what this change is.
    """
    _SAFE_ENV_PREFIXES = ("PATH", "HOME", "USER", "LANG", "LC_", "TERM",
                          "TMPDIR", "TMP", "TEMP", "SHELL", "LOGNAME",
-                          "XDG_", "PYTHONPATH", "VIRTUAL_ENV", "CONDA",
-                          "HERMES_")
+                          "XDG_", "PYTHONPATH", "VIRTUAL_ENV", "CONDA")
    _SECRET_SUBSTRINGS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "CREDENTIAL",
-                          "PASSWD", "AUTH")
+                          "PASSWD", "AUTH", "DSN", "WEBHOOK")
+    _HERMES_CHILD_ALLOWED = frozenset({
+        "HERMES_HOME", "HERMES_PROFILE", "HERMES_CONFIG", "HERMES_ENV",
+    })
    out = {}
    for k, v in source_env.items():
        if is_passthrough(k):
@ -276,6 +280,9 @@ def _legacy_posix_scrubber(source_env, is_passthrough):
            continue
        if any(k.startswith(p) for p in _SAFE_ENV_PREFIXES):
            out[k] = v
+            continue
+        if k in _HERMES_CHILD_ALLOWED:
+            out[k] = v
    return out


@ -308,13 +315,20 @@ class TestPosixEquivalence:
        "PYTHONPATH": "/opt/lib",
        "VIRTUAL_ENV": "/home/alice/.venv",
        "CONDA_PREFIX": "/opt/conda",
-        "HERMES_HOME": "/home/alice/.hermes",
-        "HERMES_INTERACTIVE": "1",
+        # HERMES_* handling (#27303): only the operational allowlist passes;
+        # every other HERMES_* is dropped (the broad prefix was removed).
+        "HERMES_HOME": "/home/alice/.hermes",        # allowlisted → kept
+        "HERMES_PROFILE": "default",                 # allowlisted → kept
+        "HERMES_INTERACTIVE": "1",                   # not allowlisted → dropped
+        "HERMES_BASE_URL": "https://api.internal",   # not allowlisted → dropped
+        "HERMES_KANBAN_DB": "postgres://u:p@h/db",   # not allowlisted → dropped
        # Secret-substring blocks
        "OPENAI_API_KEY": "sk-xxx",
        "GITHUB_TOKEN": "ghp_xxx",
        "AWS_SECRET_ACCESS_KEY": "yyy",
        "MY_PASSWORD": "hunter2",
+        "SENTRY_DSN": "https://abc@sentry.io/1",     # DSN substring → blocked
+        "SLACK_WEBHOOK": "https://hooks.slack/x",    # WEBHOOK substring → blocked
        # Uncategorized — must be dropped
        "RANDOM_UNKNOWN": "drop-me",
        "DISPLAY": ":0",
--- a/tools/approval.py
+++ b/tools/approval.py
@ -1057,6 +1057,107 @@ def _format_tirith_description(tirith_result: dict) -> str:
    return "Security scan — " + "; ".join(parts)


+def _await_gateway_decision(session_key: str, notify_cb, approval_data: dict,
+                            *, surface: str = "gateway") -> dict:
+    """Enqueue *approval_data*, notify the user, and block the calling agent
+    thread until the request is resolved or the gateway approval timeout
+    elapses — firing pre/post approval hooks and cleaning up the queue entry.
+
+    Shared by the terminal command guard (``check_all_command_guards``) and
+    the execute_code guard (``check_execute_code_guard``) so the fiddly
+    heartbeat-polling wait loop lives in one place.
+
+    Returns ``{"resolved": bool, "choice": str|None}`` on completion, or
+    ``{"resolved": False, "choice": None, "notify_failed": True}`` if the
+    notify callback raised.  Persistence of an approved choice and building
+    the final tool-facing result dict remain the caller's responsibility.
+    """
+    command = approval_data.get("command", "")
+    description = approval_data.get("description", "")
+    primary_key = approval_data.get("pattern_key", "")
+    all_keys = approval_data.get("pattern_keys", [primary_key])
+
+    entry = _ApprovalEntry(approval_data)
+    with _lock:
+        _gateway_queues.setdefault(session_key, []).append(entry)
+
+    def _drop_entry() -> None:
+        with _lock:
+            queue = _gateway_queues.get(session_key, [])
+            if entry in queue:
+                queue.remove(entry)
+            if not queue:
+                _gateway_queues.pop(session_key, None)
+
+    # Notify plugins that an approval is being requested. Fires before the
+    # gateway notify callback so observers get the event in real time.
+    _fire_approval_hook(
+        "pre_approval_request",
+        command=command,
+        description=description,
+        pattern_key=primary_key,
+        pattern_keys=list(all_keys),
+        session_key=session_key,
+        surface=surface,
+    )
+
+    # Notify the user (bridges sync agent thread → async gateway)
+    try:
+        notify_cb(approval_data)
+    except Exception as exc:
+        logger.warning("Gateway approval notify failed: %s", exc)
+        _drop_entry()
+        return {"resolved": False, "choice": None, "notify_failed": True}
+
+    # Block until the user responds or timeout (default 5 min). Poll in short
+    # slices so we can fire activity heartbeats every ~10s to the agent's
+    # inactivity tracker — otherwise the gateway watchdog kills the agent
+    # while the user is still responding. Mirrors _wait_for_process() cadence.
+    timeout = _get_approval_config().get("gateway_timeout", 300)
+    try:
+        timeout = int(timeout)
+    except (ValueError, TypeError):
+        timeout = 300
+
+    try:
+        from tools.environments.base import touch_activity_if_due
+    except Exception:  # pragma: no cover
+        touch_activity_if_due = None
+
+    _now = time.monotonic()
+    _deadline = _now + max(timeout, 0)
+    _activity_state = {"last_touch": _now, "start": _now}
+    resolved = False
+    while True:
+        _remaining = _deadline - time.monotonic()
+        if _remaining <= 0:
+            break
+        if entry.event.wait(timeout=min(1.0, _remaining)):
+            resolved = True
+            break
+        if touch_activity_if_due is not None:
+            touch_activity_if_due(_activity_state, "waiting for user approval")
+
+    _drop_entry()
+
+    choice = entry.result
+    # Normalize outcome for the post hook. Unresolved (timeout) and None both
+    # mean the user never responded; report that explicitly so plugins can
+    # distinguish timeout from explicit deny.
+    _outcome = "timeout" if not resolved else (choice if choice else "timeout")
+    _fire_approval_hook(
+        "post_approval_response",
+        command=command,
+        description=description,
+        pattern_key=primary_key,
+        pattern_keys=list(all_keys),
+        session_key=session_key,
+        surface=surface,
+        choice=_outcome,
+    )
+    return {"resolved": resolved, "choice": choice}
+
+
 def check_all_command_guards(command: str, env_type: str,
                             approval_callback=None) -> dict:
    """Run all pre-exec security checks and return a single approval decision.
@ -1207,113 +1308,27 @@ def check_all_command_guards(command: str, env_type: str,

        if notify_cb is not None:
            # --- Blocking gateway approval (queue-based) ---
-            # Each call gets its own _ApprovalEntry so parallel subagents
-            # and execute_code threads can block concurrently.
+            # Block the agent thread until the user responds; the notify +
+            # heartbeat wait loop is shared with check_execute_code_guard via
+            # _await_gateway_decision().
            approval_data = {
                "command": command,
                "pattern_key": primary_key,
                "pattern_keys": all_keys,
                "description": combined_desc,
            }
-            entry = _ApprovalEntry(approval_data)
-            with _lock:
-                _gateway_queues.setdefault(session_key, []).append(entry)
-
-            # Notify plugins that an approval is being requested. Fires before
-            # the gateway notify callback so observers (e.g. macOS notifier
-            # plugins, audit logs, Slack alerts) get the event in real time.
-            _fire_approval_hook(
-                "pre_approval_request",
-                command=command,
-                description=combined_desc,
-                pattern_key=primary_key,
-                pattern_keys=list(all_keys),
-                session_key=session_key,
-                surface="gateway",
+            decision = _await_gateway_decision(
+                session_key, notify_cb, approval_data, surface="gateway"
            )
-
-            # Notify the user (bridges sync agent thread → async gateway)
-            try:
-                notify_cb(approval_data)
-            except Exception as exc:
-                logger.warning("Gateway approval notify failed: %s", exc)
-                with _lock:
-                    queue = _gateway_queues.get(session_key, [])
-                    if entry in queue:
-                        queue.remove(entry)
-                    if not queue:
-                        _gateway_queues.pop(session_key, None)
+            if decision.get("notify_failed"):
                return {
                    "approved": False,
                    "message": "BLOCKED: Failed to send approval request to user. Do NOT retry.",
                    "pattern_key": primary_key,
                    "description": combined_desc,
                }
-
-            # Block until the user responds or timeout (default 5 min).
-            # Poll in short slices so we can fire activity heartbeats every
-            # ~10s to the agent's inactivity tracker.  Without this, the
-            # blocking event.wait() never touches activity, and the
-            # gateway's inactivity watchdog (agent.gateway_timeout, default
-            # 1800s) kills the agent while the user is still responding to
-            # the approval prompt.  Mirrors the _wait_for_process() cadence
-            # in tools/environments/base.py.
-            timeout = _get_approval_config().get("gateway_timeout", 300)
-            try:
-                timeout = int(timeout)
-            except (ValueError, TypeError):
-                timeout = 300
-
-            try:
-                from tools.environments.base import touch_activity_if_due
-            except Exception:  # pragma: no cover
-                touch_activity_if_due = None
-
-            _now = time.monotonic()
-            _deadline = _now + max(timeout, 0)
-            _activity_state = {"last_touch": _now, "start": _now}
-            resolved = False
-            while True:
-                _remaining = _deadline - time.monotonic()
-                if _remaining <= 0:
-                    break
-                # 1s poll slice — the event is set immediately when the
-                # user responds, so slice length only controls heartbeat
-                # cadence, not user-visible responsiveness.
-                if entry.event.wait(timeout=min(1.0, _remaining)):
-                    resolved = True
-                    break
-                if touch_activity_if_due is not None:
-                    touch_activity_if_due(
-                        _activity_state, "waiting for user approval"
-                    )
-
-            # Clean up this entry from the queue
-            with _lock:
-                queue = _gateway_queues.get(session_key, [])
-                if entry in queue:
-                    queue.remove(entry)
-                if not queue:
-                    _gateway_queues.pop(session_key, None)
-
-            choice = entry.result
-            # Normalize outcome for the post hook. Unresolved (timeout) and
-            # None both mean the user never responded; report that explicitly
-            # so plugins can distinguish timeout from explicit deny.
-            _outcome = (
-                "timeout" if not resolved
-                else (choice if choice else "timeout")
-            )
-            _fire_approval_hook(
-                "post_approval_response",
-                command=command,
-                description=combined_desc,
-                pattern_key=primary_key,
-                pattern_keys=list(all_keys),
-                session_key=session_key,
-                surface="gateway",
-                choice=_outcome,
-            )
+            resolved = decision["resolved"]
+            choice = decision["choice"]

            if not resolved or choice is None or choice == "deny":
                # Consent contract: silence is NOT consent, and an explicit
@ -1437,5 +1452,173 @@ def check_all_command_guards(command: str, env_type: str,
            "user_approved": True, "description": combined_desc}


+def check_execute_code_guard(code: str, env_type: str) -> dict:
+    """Approve an execute_code script before its child process is spawned.
+
+    execute_code runs arbitrary local Python — the script can call
+    ``subprocess``, ``os.system``, ``ctypes``, or other process/file APIs
+    directly, none of which pass through ``terminal()`` /
+    ``DANGEROUS_PATTERNS``. In gateway/ask contexts we fail closed by approving
+    the script as a whole before it runs (#30882). Returns the same dict
+    contract as ``check_all_command_guards``.
+
+    Scope (documented limitation, #30882): in a purely local non-interactive
+    non-gateway session (no TTY, not gateway, not cron-deny) this returns
+    approved — matching the existing terminal auto-approve contract. The
+    hardline floor still blocks catastrophic ``terminal()`` commands the script
+    issues; running arbitrary code headlessly without any approval surface is
+    trusted-by-config (set a gateway/ask surface or ``approvals.cron_mode`` to
+    require approval).
+    """
+    pattern_key = "execute_code"
+    description = (
+        "execute_code script execution. The script can spawn subprocesses or "
+        "mutate files without passing through terminal command approval; "
+        "approval is one-shot for this run."
+    )
+
+    # Isolated backends already sandbox the child — matches the container skip
+    # in check_all_command_guards / check_dangerous_command.
+    if env_type in {"docker", "singularity", "modal", "daytona", "vercel_sandbox"}:
+        return {"approved": True, "message": None}
+
+    # --yolo or approvals.mode=off: bypass (session- or process-scoped).
+    approval_mode = _get_approval_mode()
+    if _YOLO_MODE_FROZEN or is_current_session_yolo_enabled() or approval_mode == "off":
+        return {"approved": True, "message": None}
+
+    is_gateway = _is_gateway_approval_context()
+    is_ask = env_var_enabled("HERMES_EXEC_ASK")
+
+    # Cron: no user is present to approve arbitrary code.
+    if env_var_enabled("HERMES_CRON_SESSION"):
+        if _get_cron_approval_mode() == "deny":
+            return {
+                "approved": False,
+                "message": (
+                    "BLOCKED: execute_code runs arbitrary local Python "
+                    "(including subprocess calls that bypass shell-string "
+                    "approval checks). Cron jobs run without a user present "
+                    "to approve it. Use normal tools instead, or set "
+                    "approvals.cron_mode: approve only if this cron profile "
+                    "is intentionally trusted."
+                ),
+                "pattern_key": pattern_key,
+                "description": description,
+                "outcome": "blocked",
+                "user_consent": False,
+            }
+        return {"approved": True, "message": None}
+
+    # Only gateway/ask contexts get the one-shot whole-script approval.
+    #   * CLI interactive: the script's terminal() calls are guarded per-call
+    #     (context now propagates into the RPC thread, #33057); a whole-script
+    #     prompt would fire on every execute_code call.
+    #   * Local non-interactive non-gateway: documented limitation above.
+    if not is_gateway and not is_ask:
+        return {"approved": True, "message": None}
+
+    session_key = get_current_session_key()
+    # Built only now (past the early-return gates) so the common non-approval
+    # paths don't pay to copy a potentially-large script into this string.
+    command = f"execute_code <<'PY'\n{code}\nPY"
+
+    # Smart mode: ask the aux LLM about the whole script. An APPROVE here only
+    # suppresses the redundant whole-script prompt; the per-call terminal()
+    # guards (restored by context propagation) still run independently.
+    if approval_mode == "smart":
+        verdict = _smart_approve(command, description)
+        if verdict == "approve":
+            logger.debug("Smart approval: auto-approved execute_code for session %s",
+                         session_key)
+            return {"approved": True, "message": None,
+                    "smart_approved": True, "description": description}
+        if verdict == "deny":
+            return {
+                "approved": False,
+                "message": ("BLOCKED by smart approval: execute_code script "
+                            "execution was assessed as genuinely dangerous. "
+                            "Do NOT retry."),
+                "smart_denied": True,
+                "pattern_key": pattern_key,
+                "description": description,
+                "outcome": "denied",
+                "user_consent": False,
+            }
+        # verdict == "escalate" → fall through to manual approval
+
+    notify_cb = None
+    with _lock:
+        notify_cb = _gateway_notify_cbs.get(session_key)
+
+    if notify_cb is None:
+        # No gateway callback registered (e.g. ask-mode without a notifier):
+        # surface a pending approval for backward compatibility.
+        submit_pending(session_key, {
+            "command": command,
+            "pattern_key": pattern_key,
+            "pattern_keys": [pattern_key],
+            "description": description,
+        })
+        return {
+            "approved": False,
+            "pattern_key": pattern_key,
+            "status": "pending_approval",
+            "approval_pending": True,
+            "command": command,
+            "description": description,
+            "message": (
+                f"⚠️ {description}. Asking the user for approval.\n\n"
+                f"**Code:**\n```python\n{code}\n```"
+            ),
+        }
+
+    approval_data = {
+        "command": command,
+        "pattern_key": pattern_key,
+        "pattern_keys": [pattern_key],
+        "description": description,
+    }
+    decision = _await_gateway_decision(
+        session_key, notify_cb, approval_data, surface="gateway"
+    )
+    if decision.get("notify_failed"):
+        return {
+            "approved": False,
+            "message": ("BLOCKED: Failed to send execute_code approval request "
+                        "to user. Do NOT retry."),
+            "pattern_key": pattern_key,
+            "description": description,
+            "outcome": "notify_failed",
+            "user_consent": False,
+        }
+
+    resolved = decision["resolved"]
+    choice = decision["choice"]
+
+    if not resolved or choice is None or choice == "deny":
+        reason = "timed out without user response" if not resolved else "denied by user"
+        addendum = " Silence is not consent." if not resolved else ""
+        return {
+            "approved": False,
+            "message": (
+                f"BLOCKED: execute_code script {reason}. The user has NOT "
+                f"consented to running this code. Do NOT retry, do NOT rephrase "
+                f"the script, and do NOT attempt the same outcome via a "
+                f"different tool.{addendum}"
+            ),
+            "pattern_key": pattern_key,
+            "description": description,
+            "outcome": "timeout" if not resolved else "denied",
+            "user_consent": False,
+        }
+
+    # Approved — one-shot only. Deliberately NO approve_session/approve_permanent:
+    # each execute_code script is distinct arbitrary code, so approval never
+    # persists to future scripts.
+    return {"approved": True, "message": None,
+            "user_approved": True, "description": description}
+
+
 # Load permanent allowlist from config on module import
 load_permanent_allowlist()
--- a/tools/code_execution_tool.py
+++ b/tools/code_execution_tool.py
@ -46,6 +46,8 @@ import uuid
 _IS_WINDOWS = platform.system() == "Windows"
 from typing import Any, Dict, List, Optional

+from tools.thread_context import propagate_context_to_thread
+
 # Availability gate.  On Windows we fall back to loopback TCP for the
 # sandbox RPC transport (AF_UNIX is unreliable on Windows Python) — see
 # ``_use_tcp_rpc`` in ``_execute_local`` below.  That makes execute_code
@ -74,13 +76,30 @@ MAX_STDERR_BYTES = 10_000    # 10 KB

 # Environment variable scrubbing rules (shared between the local + remote
 # backends).  Secret-substring block is applied first; anything left must
-# match either a safe prefix or, on Windows, an OS-essential name.
+# match a safe prefix, the operational HERMES_ allowlist, or (on Windows) an
+# OS-essential name.
+#
+# NB: the broad "HERMES_" prefix was deliberately removed (#27303) — it leaked
+# HERMES_*-named config that lacks a secret substring (e.g. HERMES_BASE_URL,
+# HERMES_KANBAN_DB, HERMES_*_WEBHOOK).  The child only needs the few
+# location/profile vars in _HERMES_CHILD_ALLOWED below; HERMES_RPC_SOCKET /
+# HERMES_RPC_DIR / TZ / HOME are injected explicitly after scrubbing.
 _SAFE_ENV_PREFIXES = ("PATH", "HOME", "USER", "LANG", "LC_", "TERM",
                      "TMPDIR", "TMP", "TEMP", "SHELL", "LOGNAME",
-                      "XDG_", "PYTHONPATH", "VIRTUAL_ENV", "CONDA",
-                      "HERMES_")
+                      "XDG_", "PYTHONPATH", "VIRTUAL_ENV", "CONDA")
 _SECRET_SUBSTRINGS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "CREDENTIAL",
-                      "PASSWD", "AUTH")
+                      "PASSWD", "AUTH", "DSN", "WEBHOOK")
+
+# Operational HERMES_* vars the child legitimately needs by exact name — these
+# are non-secret runtime-location flags (the same set hermes_cli treats as the
+# runtime location) that repo-root modules a sandbox script imports may read at
+# import time.  None match _SECRET_SUBSTRINGS.
+_HERMES_CHILD_ALLOWED = frozenset({
+    "HERMES_HOME",
+    "HERMES_PROFILE",
+    "HERMES_CONFIG",
+    "HERMES_ENV",
+})

 # Windows-only: a handful of variables are required by the OS/CRT itself.
 # Without them, even stdlib calls like ``socket.socket()`` fail with
@ -119,9 +138,10 @@ def _scrub_child_env(source_env, is_passthrough=None, is_windows=None):

    Rules (order matters):
      1. Passthrough vars (skill- or config-declared) always pass.
-      2. Secret-substring names (KEY/TOKEN/etc.) are blocked.
+      2. Secret-substring names (KEY/TOKEN/DSN/WEBHOOK/etc.) are blocked.
      3. Names matching a safe prefix pass.
-      4. On Windows, a small OS-essential allowlist passes by exact name
+      4. Operational HERMES_* vars (_HERMES_CHILD_ALLOWED) pass by exact name.
+      5. On Windows, a small OS-essential allowlist passes by exact name
         — without these the child can't even create a socket or spawn a
         subprocess.

@ -147,6 +167,9 @@ def _scrub_child_env(source_env, is_passthrough=None, is_windows=None):
        if any(k.startswith(p) for p in _SAFE_ENV_PREFIXES):
            scrubbed[k] = v
            continue
+        if k in _HERMES_CHILD_ALLOWED:
+            scrubbed[k] = v
+            continue
        if is_windows and k.upper() in _WINDOWS_ESSENTIAL_ENV_VARS:
            scrubbed[k] = v
    return scrubbed
@ -887,9 +910,11 @@ def _execute_remote(
        _ship_file_to_remote(env, f"{sandbox_dir}/hermes_tools.py", tools_src)
        _ship_file_to_remote(env, f"{sandbox_dir}/script.py", code)

-        # Start RPC polling thread
+        # Wrapped so the thread inherits the turn's approval context + callbacks
+        # (see tools.thread_context) — else sandbox RPC tool calls lose approval
+        # routing (#33057).
        rpc_thread = threading.Thread(
-            target=_rpc_poll_loop,
+            target=propagate_context_to_thread(_rpc_poll_loop),
            args=(
                env, f"{sandbox_dir}/rpc", effective_task_id,
                tool_call_log, tool_call_counter, max_tool_calls,
@ -1049,6 +1074,21 @@ def execute_code(
    # Dispatch: remote backends use file-based RPC, local uses UDS
    from tools.terminal_tool import _get_env_config
    env_type = _get_env_config()["env_type"]
+
+    # execute_code runs arbitrary Python (subprocess/os.system/...) that never
+    # passes through terminal()/DANGEROUS_PATTERNS, so guard the whole script
+    # here before either dispatch path spawns it. Runs synchronously in the
+    # caller (tool-executor) thread, which holds the session context (#30882).
+    from tools.approval import check_execute_code_guard
+    _guard = check_execute_code_guard(code, env_type)
+    if not _guard.get("approved", False):
+        return json.dumps({
+            "status": "error",
+            "error": _guard.get("message") or "execute_code blocked by approval guard.",
+            "tool_calls_made": 0,
+            "duration_seconds": 0,
+        }, ensure_ascii=False)
+
    if env_type != "local":
        return _execute_remote(code, task_id, enabled_tools)

@ -1135,8 +1175,11 @@ def execute_code(
            os.chmod(sock_path, 0o600)
        server_sock.listen(1)

+        # Wrapped so the thread inherits the turn's approval context + callbacks
+        # (see tools.thread_context) — else gateway sandbox tool calls silently
+        # auto-approve dangerous commands (#33057, #30882).
        rpc_thread = threading.Thread(
-            target=_rpc_server_loop,
+            target=propagate_context_to_thread(_rpc_server_loop),
            args=(
                server_sock, task_id, tool_call_log,
                tool_call_counter, max_tool_calls, sandbox_tools,