fix(tools): don't compound-rewrite spawn_via_env background wrappers

Background tasks on non-local backends (SSH/Docker/Modal/Daytona/Singularity)
go through `ProcessRegistry.spawn_via_env`, which builds a hand-crafted,
shell-safe wrapper:

    mkdir -p T && ( nohup bash -lc CMD > LOG 2>&1; rc=$?; ... ) & echo $! > PID && cat PID

`BaseEnvironment.execute()` unconditionally ran `_rewrite_compound_background`
on every command, including this wrapper. The rewrite (meant to defuse the
`A && B &` subshell-wait trap for user commands) turns `( ... ) & echo $!` into
`{ ( ... ) & } echo $!` — note `} echo` with no separator, which is a bash
syntax error. The wrapper then never produces a PID, the redirected output file
is never created, and the agent sees an immediate exit code -1. This breaks
*every* background launch on a non-local backend (e.g. a simple
count-and-redirect script over SSH), not just edge cases.

Fix:
- Add `rewrite_compound_background: bool = True` to `BaseEnvironment.execute()`
  (and the `BaseModalExecutionEnvironment` override, which accepts and ignores
  it). Default preserves existing behavior; the user foreground terminal path
  still rewrites.
- `spawn_via_env` passes `rewrite_compound_background=False` so its already
  shell-safe wrapper is left intact.
- Treat a wrapper that produces no PID as a failed launch (mark the session
  exited with a real exit code instead of exposing a fake running session), and
  don't register/checkpoint a session that never started.

Verified empirically: with the rewrite skipped, the wrapper is valid bash,
launches the process, captures the PID, and writes the log/pid/exit files; the
old rewritten form fails `bash -n` with a syntax error.

Based on #33756 by @CharZhou (extracted from a multi-feature branch; the
unrelated image_gen / docker-media changes are not included here).

Co-authored-by: CharZhou <17255546+CharZhou@users.noreply.github.com>
This commit is contained in:
kshitijk4poor
2026-06-01 00:05:10 +05:30
parent 1044d9f25d
commit 6f8975dcd8
5 changed files with 82 additions and 15 deletions

View File

@ -49,6 +49,7 @@ AUTHOR_MAP = {
"mathijs.vd.hurk@gmail.com": "mathijsvandenhurk",
"drpelagik@gmail.com": "SeaXen",
"lengr@users.noreply.github.com": "LengR",
"17255546+CharZhou@users.noreply.github.com": "CharZhou",
"metalclaudbot@gmail.com": "HashClawAI",
"tonybear55665566@gmail.com": "TonyPepeBear",
"kaspersniels@gmail.com": "nielskaspers",

View File

@ -481,8 +481,8 @@ class TestSpawnEnvSanitization:
def get_temp_dir(self):
return "/data/data/com.termux/files/usr/tmp"
def execute(self, command, timeout=None):
self.commands.append((command, timeout))
def execute(self, command, **kwargs):
self.commands.append((command, kwargs))
return {"output": "4321\n"}
env = FakeEnv()
@ -501,6 +501,52 @@ class TestSpawnEnvSanitization:
assert "cat /tmp/hermes_bg_" not in bg_command
fake_thread.start.assert_called_once()
def test_spawn_via_env_checks_returncode_when_wrapper_fails(self, registry):
class FakeEnv:
def __init__(self):
self.commands = []
def execute(self, command, **kwargs):
self.commands.append((command, kwargs))
return {"output": "syntax error", "returncode": 2}
env = FakeEnv()
fake_thread = MagicMock()
with patch("tools.process_registry.threading.Thread", return_value=fake_thread), \
patch.object(registry, "_write_checkpoint"):
session = registry.spawn_via_env(env, "echo hello")
assert session.exited is True
assert session.exit_code == 2
assert session.pid is None
assert session.output_buffer == "syntax error"
fake_thread.start.assert_not_called()
# A failed launch must not be exposed as a running/tracked session.
assert session.id not in registry._running
def test_spawn_via_env_disables_rewrite_for_bg_wrapper(self, registry):
class FakeEnv:
def __init__(self):
self.commands = []
def get_temp_dir(self):
return "/tmp"
def execute(self, command, **kwargs):
self.commands.append((command, kwargs))
return {"output": "4321\n", "returncode": 0}
env = FakeEnv()
fake_thread = MagicMock()
with patch("tools.process_registry.threading.Thread", return_value=fake_thread), \
patch.object(registry, "_write_checkpoint"):
registry.spawn_via_env(env, "echo hello")
args, kwargs = env.commands[0]
assert kwargs.get("rewrite_compound_background") is False
def test_env_poller_quotes_temp_paths_with_spaces(self, registry):
session = _make_session(sid="proc_space")
session.exited = False
@ -514,8 +560,8 @@ class TestSpawnEnvSanitization:
{"output": "0\n"},
])
def execute(self, command, timeout=None):
self.commands.append((command, timeout))
def execute(self, command, **kwargs):
self.commands.append((command, kwargs))
return next(self._responses)
env = FakeEnv()

View File

@ -833,18 +833,18 @@ class BaseEnvironment(ABC):
*,
timeout: int | None = None,
stdin_data: str | None = None,
rewrite_compound_background: bool = True,
) -> dict:
"""Execute a command, return {"output": str, "returncode": int}."""
self._before_execute()
exec_command, sudo_stdin = self._prepare_command(command)
# Guard against the `A && B &` subshell-wait trap: bash forks a
# subshell for the compound that then waits for an infinite B (a
# server, `yes > /dev/null`, etc.), leaking the subshell forever.
# Rewriting to `A && { B & }` runs B as a plain background in the
# current shell — no subshell wait.
from tools.terminal_tool import _rewrite_compound_background
exec_command = _rewrite_compound_background(exec_command)
# Guard against the `A && B &` subshell-wait trap by default.
# Some callers (spawn_via_env) already produce shell-safe wrappers and
# pass rewrite_compound_background=False.
if rewrite_compound_background:
from tools.terminal_tool import _rewrite_compound_background
exec_command = _rewrite_compound_background(exec_command)
effective_timeout = timeout or self.timeout
effective_cwd = cwd or self.cwd
@ -893,4 +893,3 @@ class BaseEnvironment(ABC):
from tools.terminal_tool import _transform_sudo_command
return _transform_sudo_command(command)

View File

@ -79,7 +79,12 @@ class BaseModalExecutionEnvironment(BaseEnvironment):
*,
timeout: int | None = None,
stdin_data: str | None = None,
rewrite_compound_background: bool = True,
) -> dict:
# Managed/remote modal transports execute commands via explicit transport
# and do not rely on shell background rewriters. Keep parameter for
# compatibility with BaseEnvironment callers.
_ = rewrite_compound_background
self._before_execute()
prepared = self._prepare_modal_exec(
command,

View File

@ -697,7 +697,11 @@ class ProcessRegistry:
)
try:
result = env.execute(bg_command, timeout=timeout)
result = env.execute(
bg_command,
timeout=timeout,
rewrite_compound_background=False,
)
output = result.get("output", "").strip()
# Try to extract the PID from the output
for line in output.splitlines():
@ -705,6 +709,15 @@ class ProcessRegistry:
if line.isdigit():
session.pid = int(line)
break
# If the wrapper couldn't produce a PID (for example, syntax
# error or broken redirect), treat it as a failed launch instead
# of exposing a fake running session.
if session.pid is None:
session.exited = True
session.exit_code = int(result.get("returncode", -1))
if session.exit_code == 0:
session.exit_code = -1
session.output_buffer = result.get("output", "").strip()
except Exception as e:
session.exited = True
session.exit_code = -1
@ -723,9 +736,12 @@ class ProcessRegistry:
with self._lock:
self._prune_if_needed()
self._running[session.id] = session
if not session.exited:
self._running[session.id] = session
if not session.exited:
self._write_checkpoint()
self._write_checkpoint()
return session
# ----- Reader / Poller Threads -----