From 1c53d39eaaf2fc57a7fb5039911e28d07ad71cb1 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 04:11:09 -0700 Subject: [PATCH] test: deflake process-registry kill + PTY resize tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two CI flakes surfaced on PR #34572 (both in files this PR doesn't touch; pre-existing host-dependent flakes): 1. test_process_registry::TestPopenLeakOnSetupFailure — the failure-cleanup tests use a fake proc.pid (8888/9999) and assert proc.kill() runs. But spawn_local's primary cleanup is os.killpg(os.getpgid(pid), SIGKILL), falling back to proc.kill() only on ProcessLookupError/PermissionError/ OSError. When the fake PID happens to exist on a busy host, os.getpgid succeeds, os.killpg fires against an UNRELATED real process group, and proc.kill() is never reached -> flaky AssertionError (and a real risk of SIGKILLing an innocent process group from a unit test). Patch os.getpgid to raise ProcessLookupError so the fallback path runs deterministically and no real killpg is ever issued. 2. test_web_server::test_resize_escape_is_forwarded — the receive loop calls the blocking conn.receive_bytes() with no exception guard. Once the child prints its winsize and exits, the PTY closes; on a missed-marker run the next recv blocks until the 30s pytest-timeout instead of failing fast. Add a try/except break (matching the working sibling tests) and bump the child's pre-read sleep 0.15s -> 0.5s so the resize reliably lands first. Verified: 4/4 pass across 3 consecutive runs; root cause for #1 reproduced (os.getpgid(1) succeeds -> old code skips proc.kill). --- tests/hermes_cli/test_web_server.py | 11 +++++++++-- tests/tools/test_process_registry.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/tests/hermes_cli/test_web_server.py b/tests/hermes_cli/test_web_server.py index 23dde91ad..cdc577d09 100644 --- a/tests/hermes_cli/test_web_server.py +++ b/tests/hermes_cli/test_web_server.py @@ -2195,7 +2195,7 @@ class TestPtyWebSocket: winsize_script = ( "import fcntl, struct, termios, time; " - "time.sleep(0.15); " + "time.sleep(0.5); " "rows, cols, *_ = struct.unpack('HHHH', " "fcntl.ioctl(0, termios.TIOCGWINSZ, b'\\0' * 8)); " "print(cols); print(rows)" @@ -2217,7 +2217,14 @@ class TestPtyWebSocket: deadline = time.monotonic() + 5.0 while time.monotonic() < deadline: - frame = conn.receive_bytes() + # receive_bytes() blocks; once the child prints its winsize and + # exits, the PTY closes and further reads raise. Without this + # guard a missed-marker run blocks until the 30s pytest-timeout + # (flaky failure) instead of failing fast on the assert below. + try: + frame = conn.receive_bytes() + except Exception: + break if frame: buf += frame if b"99" in buf and b"41" in buf: diff --git a/tests/tools/test_process_registry.py b/tests/tools/test_process_registry.py index b55637135..bc1ec06d6 100644 --- a/tests/tools/test_process_registry.py +++ b/tests/tools/test_process_registry.py @@ -561,9 +561,18 @@ class TestPopenLeakOnSetupFailure: def boom(*args, **kwargs): raise RuntimeError("Thread creation failed") + # proc.pid is a MagicMock-backed fake; os.getpgid(fake_pid) would query + # the real OS for an arbitrary PID. On a busy host that PID may exist, + # in which case spawn_local's primary cleanup path + # (os.killpg(os.getpgid(pid), SIGKILL)) succeeds against an UNRELATED + # real process group and proc.kill() is never reached — flaky failure, + # and a real risk of SIGKILLing an innocent process group. Force the + # ProcessLookupError fallback so the test deterministically exercises + # proc.kill() and never issues a real killpg. with patch("tools.process_registry._find_shell", return_value="/bin/bash"), \ patch("subprocess.Popen", return_value=proc), \ patch("threading.Thread", side_effect=boom), \ + patch("os.getpgid", side_effect=ProcessLookupError), \ patch.object(registry, "_write_checkpoint"): with pytest.raises(RuntimeError, match="Thread creation failed"): registry.spawn_local("echo hello", cwd="/tmp") @@ -588,9 +597,14 @@ class TestPopenLeakOnSetupFailure: fake_thread = MagicMock() + # See note in test_popen_killed_when_thread_creation_fails: force the + # ProcessLookupError fallback so cleanup deterministically calls + # proc.kill() instead of issuing a real os.killpg against whatever + # process group happens to own the fake PID on the host. with patch("tools.process_registry._find_shell", return_value="/bin/bash"), \ patch("subprocess.Popen", return_value=proc), \ patch("threading.Thread", return_value=fake_thread), \ + patch("os.getpgid", side_effect=ProcessLookupError), \ patch.object(registry, "_write_checkpoint", side_effect=OSError("disk full")): with pytest.raises(OSError, match="disk full"): registry.spawn_local("echo hello", cwd="/tmp")