fix(tools): don't compound-rewrite spawn_via_env background wrappers
Background tasks on non-local backends (SSH/Docker/Modal/Daytona/Singularity)
go through `ProcessRegistry.spawn_via_env`, which builds a hand-crafted,
shell-safe wrapper:
mkdir -p T && ( nohup bash -lc CMD > LOG 2>&1; rc=$?; ... ) & echo $! > PID && cat PID
`BaseEnvironment.execute()` unconditionally ran `_rewrite_compound_background`
on every command, including this wrapper. The rewrite (meant to defuse the
`A && B &` subshell-wait trap for user commands) turns `( ... ) & echo $!` into
`{ ( ... ) & } echo $!` — note `} echo` with no separator, which is a bash
syntax error. The wrapper then never produces a PID, the redirected output file
is never created, and the agent sees an immediate exit code -1. This breaks
*every* background launch on a non-local backend (e.g. a simple
count-and-redirect script over SSH), not just edge cases.
Fix:
- Add `rewrite_compound_background: bool = True` to `BaseEnvironment.execute()`
(and the `BaseModalExecutionEnvironment` override, which accepts and ignores
it). Default preserves existing behavior; the user foreground terminal path
still rewrites.
- `spawn_via_env` passes `rewrite_compound_background=False` so its already
shell-safe wrapper is left intact.
- Treat a wrapper that produces no PID as a failed launch (mark the session
exited with a real exit code instead of exposing a fake running session), and
don't register/checkpoint a session that never started.
Verified empirically: with the rewrite skipped, the wrapper is valid bash,
launches the process, captures the PID, and writes the log/pid/exit files; the
old rewritten form fails `bash -n` with a syntax error.
Based on #33756 by @CharZhou (extracted from a multi-feature branch; the
unrelated image_gen / docker-media changes are not included here).
Co-authored-by: CharZhou <17255546+CharZhou@users.noreply.github.com>
This commit is contained in:
@ -49,6 +49,7 @@ AUTHOR_MAP = {
|
||||
"mathijs.vd.hurk@gmail.com": "mathijsvandenhurk",
|
||||
"drpelagik@gmail.com": "SeaXen",
|
||||
"lengr@users.noreply.github.com": "LengR",
|
||||
"17255546+CharZhou@users.noreply.github.com": "CharZhou",
|
||||
"metalclaudbot@gmail.com": "HashClawAI",
|
||||
"tonybear55665566@gmail.com": "TonyPepeBear",
|
||||
"kaspersniels@gmail.com": "nielskaspers",
|
||||
|
||||
@ -481,8 +481,8 @@ class TestSpawnEnvSanitization:
|
||||
def get_temp_dir(self):
|
||||
return "/data/data/com.termux/files/usr/tmp"
|
||||
|
||||
def execute(self, command, timeout=None):
|
||||
self.commands.append((command, timeout))
|
||||
def execute(self, command, **kwargs):
|
||||
self.commands.append((command, kwargs))
|
||||
return {"output": "4321\n"}
|
||||
|
||||
env = FakeEnv()
|
||||
@ -501,6 +501,52 @@ class TestSpawnEnvSanitization:
|
||||
assert "cat /tmp/hermes_bg_" not in bg_command
|
||||
fake_thread.start.assert_called_once()
|
||||
|
||||
def test_spawn_via_env_checks_returncode_when_wrapper_fails(self, registry):
|
||||
class FakeEnv:
|
||||
def __init__(self):
|
||||
self.commands = []
|
||||
|
||||
def execute(self, command, **kwargs):
|
||||
self.commands.append((command, kwargs))
|
||||
return {"output": "syntax error", "returncode": 2}
|
||||
|
||||
env = FakeEnv()
|
||||
fake_thread = MagicMock()
|
||||
|
||||
with patch("tools.process_registry.threading.Thread", return_value=fake_thread), \
|
||||
patch.object(registry, "_write_checkpoint"):
|
||||
session = registry.spawn_via_env(env, "echo hello")
|
||||
|
||||
assert session.exited is True
|
||||
assert session.exit_code == 2
|
||||
assert session.pid is None
|
||||
assert session.output_buffer == "syntax error"
|
||||
fake_thread.start.assert_not_called()
|
||||
# A failed launch must not be exposed as a running/tracked session.
|
||||
assert session.id not in registry._running
|
||||
|
||||
def test_spawn_via_env_disables_rewrite_for_bg_wrapper(self, registry):
|
||||
class FakeEnv:
|
||||
def __init__(self):
|
||||
self.commands = []
|
||||
|
||||
def get_temp_dir(self):
|
||||
return "/tmp"
|
||||
|
||||
def execute(self, command, **kwargs):
|
||||
self.commands.append((command, kwargs))
|
||||
return {"output": "4321\n", "returncode": 0}
|
||||
|
||||
env = FakeEnv()
|
||||
fake_thread = MagicMock()
|
||||
|
||||
with patch("tools.process_registry.threading.Thread", return_value=fake_thread), \
|
||||
patch.object(registry, "_write_checkpoint"):
|
||||
registry.spawn_via_env(env, "echo hello")
|
||||
|
||||
args, kwargs = env.commands[0]
|
||||
assert kwargs.get("rewrite_compound_background") is False
|
||||
|
||||
def test_env_poller_quotes_temp_paths_with_spaces(self, registry):
|
||||
session = _make_session(sid="proc_space")
|
||||
session.exited = False
|
||||
@ -514,8 +560,8 @@ class TestSpawnEnvSanitization:
|
||||
{"output": "0\n"},
|
||||
])
|
||||
|
||||
def execute(self, command, timeout=None):
|
||||
self.commands.append((command, timeout))
|
||||
def execute(self, command, **kwargs):
|
||||
self.commands.append((command, kwargs))
|
||||
return next(self._responses)
|
||||
|
||||
env = FakeEnv()
|
||||
|
||||
@ -833,18 +833,18 @@ class BaseEnvironment(ABC):
|
||||
*,
|
||||
timeout: int | None = None,
|
||||
stdin_data: str | None = None,
|
||||
rewrite_compound_background: bool = True,
|
||||
) -> dict:
|
||||
"""Execute a command, return {"output": str, "returncode": int}."""
|
||||
self._before_execute()
|
||||
|
||||
exec_command, sudo_stdin = self._prepare_command(command)
|
||||
# Guard against the `A && B &` subshell-wait trap: bash forks a
|
||||
# subshell for the compound that then waits for an infinite B (a
|
||||
# server, `yes > /dev/null`, etc.), leaking the subshell forever.
|
||||
# Rewriting to `A && { B & }` runs B as a plain background in the
|
||||
# current shell — no subshell wait.
|
||||
from tools.terminal_tool import _rewrite_compound_background
|
||||
exec_command = _rewrite_compound_background(exec_command)
|
||||
# Guard against the `A && B &` subshell-wait trap by default.
|
||||
# Some callers (spawn_via_env) already produce shell-safe wrappers and
|
||||
# pass rewrite_compound_background=False.
|
||||
if rewrite_compound_background:
|
||||
from tools.terminal_tool import _rewrite_compound_background
|
||||
exec_command = _rewrite_compound_background(exec_command)
|
||||
effective_timeout = timeout or self.timeout
|
||||
effective_cwd = cwd or self.cwd
|
||||
|
||||
@ -893,4 +893,3 @@ class BaseEnvironment(ABC):
|
||||
from tools.terminal_tool import _transform_sudo_command
|
||||
|
||||
return _transform_sudo_command(command)
|
||||
|
||||
|
||||
@ -79,7 +79,12 @@ class BaseModalExecutionEnvironment(BaseEnvironment):
|
||||
*,
|
||||
timeout: int | None = None,
|
||||
stdin_data: str | None = None,
|
||||
rewrite_compound_background: bool = True,
|
||||
) -> dict:
|
||||
# Managed/remote modal transports execute commands via explicit transport
|
||||
# and do not rely on shell background rewriters. Keep parameter for
|
||||
# compatibility with BaseEnvironment callers.
|
||||
_ = rewrite_compound_background
|
||||
self._before_execute()
|
||||
prepared = self._prepare_modal_exec(
|
||||
command,
|
||||
|
||||
@ -697,7 +697,11 @@ class ProcessRegistry:
|
||||
)
|
||||
|
||||
try:
|
||||
result = env.execute(bg_command, timeout=timeout)
|
||||
result = env.execute(
|
||||
bg_command,
|
||||
timeout=timeout,
|
||||
rewrite_compound_background=False,
|
||||
)
|
||||
output = result.get("output", "").strip()
|
||||
# Try to extract the PID from the output
|
||||
for line in output.splitlines():
|
||||
@ -705,6 +709,15 @@ class ProcessRegistry:
|
||||
if line.isdigit():
|
||||
session.pid = int(line)
|
||||
break
|
||||
# If the wrapper couldn't produce a PID (for example, syntax
|
||||
# error or broken redirect), treat it as a failed launch instead
|
||||
# of exposing a fake running session.
|
||||
if session.pid is None:
|
||||
session.exited = True
|
||||
session.exit_code = int(result.get("returncode", -1))
|
||||
if session.exit_code == 0:
|
||||
session.exit_code = -1
|
||||
session.output_buffer = result.get("output", "").strip()
|
||||
except Exception as e:
|
||||
session.exited = True
|
||||
session.exit_code = -1
|
||||
@ -723,9 +736,12 @@ class ProcessRegistry:
|
||||
|
||||
with self._lock:
|
||||
self._prune_if_needed()
|
||||
self._running[session.id] = session
|
||||
if not session.exited:
|
||||
self._running[session.id] = session
|
||||
|
||||
if not session.exited:
|
||||
self._write_checkpoint()
|
||||
|
||||
self._write_checkpoint()
|
||||
return session
|
||||
|
||||
# ----- Reader / Poller Threads -----
|
||||
|
||||
Reference in New Issue
Block a user