From 96663056309655bec290aedece7a10555e4c74f4 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 3 Jun 2026 09:00:16 -0700 Subject: [PATCH] fix(dashboard): clamp PTY resize dimensions for WSL2 winsize garbage (#38200) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(dashboard): clamp PTY resize dimensions for WSL2 winsize garbage WSL2 reports columns=131072, rows=1 from a broken winsize probe. The dashboard /chat tab forwards xterm.js dimensions through PtyBridge.resize(), which packs them as unsigned short via struct.pack. 131072 > 65535 raised struct.error — uncaught (only OSError was handled) — breaking the resize path and leaving the TUI laid out for a one-row, absurdly-wide screen, which surfaces as blank/disappearing text. Clamp cols/rows to a sane [1, 2000]x[1, 1000] range before packing. Non-finite/non-integer probes fall back to the minimum so nothing can reach struct.pack and raise. * test(dashboard): de-flake pub/events broadcast test test_pub_broadcasts_to_events_subscribers round-tripped a frame through two nested Starlette TestClient WebSocket portals within a 10s wall-clock budget. Under heavy parallel CI load a starved ASGI thread occasionally blew that budget even though the server logic is correct, producing intermittent 'broadcast not received within 10s' failures. Drive _broadcast_event directly under asyncio with fake subscribers instead. Same fan-out contract (verbatim delivery to every subscriber on the channel, nothing to other channels), zero scheduling surface. Runs in ~0.3s, deterministic across 10 consecutive runs. --- hermes_cli/pty_bridge.py | 43 +++++++++++- tests/hermes_cli/test_pty_bridge.py | 66 ++++++++++++++++++ tests/hermes_cli/test_web_server.py | 102 +++++++++++++--------------- 3 files changed, 156 insertions(+), 55 deletions(-) diff --git a/hermes_cli/pty_bridge.py b/hermes_cli/pty_bridge.py index a1779aa1d..511a3c39c 100644 --- a/hermes_cli/pty_bridge.py +++ b/hermes_cli/pty_bridge.py @@ -50,6 +50,33 @@ except ImportError: # pragma: no cover - dev env without ptyprocess __all__ = ["PtyBridge", "PtyUnavailableError"] +# ``struct winsize`` packs rows/cols as unsigned short (0..65535). We clamp +# well below that ceiling: real terminals never exceed a couple thousand +# columns, and a value above this is a broken probe (WSL2 reports +# columns=131072) rather than a genuine ultrawide. Lower bound is 1 — a +# zero/negative dimension is the classic "no size yet" signal. +_MIN_DIMENSION = 1 +_MAX_COLS = 2000 +_MAX_ROWS = 1000 + + +def _clamp_dimension(value: int, maximum: int) -> int: + """Clamp a reported terminal dimension into ``[_MIN_DIMENSION, maximum]``. + + Non-integer / non-finite values fall back to ``_MIN_DIMENSION`` so a bad + probe can never reach ``struct.pack`` and raise ``struct.error``. + """ + try: + n = int(value) + except (TypeError, ValueError, OverflowError): + return _MIN_DIMENSION + if n < _MIN_DIMENSION: + return _MIN_DIMENSION + if n > maximum: + return maximum + return n + + class PtyUnavailableError(RuntimeError): """Raised when a PTY cannot be created on this platform. @@ -189,11 +216,23 @@ class PtyBridge: view = view[n:] def resize(self, cols: int, rows: int) -> None: - """Forward a terminal resize to the child via ``TIOCSWINSZ``.""" + """Forward a terminal resize to the child via ``TIOCSWINSZ``. + + Dimensions are clamped to a sane range first. Some hosts report + garbage window sizes — the motivating case is WSL2, where xterm.js + in the dashboard ``/chat`` tab can pick up ``columns=131072, + rows=1`` from a broken winsize probe. ``struct winsize`` packs each + field as an unsigned short (max 65535), so an unclamped 131072 would + raise ``struct.error`` (not ``OSError``) and break the resize path, + leaving the TUI laid out for a one-row / absurdly-wide screen — + which is what shows up as blank / disappearing text. + """ if self._closed: return + cols = _clamp_dimension(cols, _MAX_COLS) + rows = _clamp_dimension(rows, _MAX_ROWS) # struct winsize: rows, cols, xpixel, ypixel (all unsigned short) - winsize = struct.pack("HHHH", max(1, rows), max(1, cols), 0, 0) + winsize = struct.pack("HHHH", rows, cols, 0, 0) try: fcntl.ioctl(self._fd, termios.TIOCSWINSZ, winsize) except OSError: diff --git a/tests/hermes_cli/test_pty_bridge.py b/tests/hermes_cli/test_pty_bridge.py index 4f366fd72..9ae007cc4 100644 --- a/tests/hermes_cli/test_pty_bridge.py +++ b/tests/hermes_cli/test_pty_bridge.py @@ -120,6 +120,72 @@ class TestPtyBridgeResize: finally: bridge.close() + def test_resize_clamps_wsl_garbage_dimensions(self): + # WSL2 reports columns=131072, rows=1 from a broken winsize probe. + # 131072 > 65535 (unsigned short max) used to raise struct.error in + # resize() — uncaught, since only OSError was handled — and broke the + # dashboard /chat resize path (blank/disappearing text). The clamp + # must coerce the width down to the sane max and never raise. + winsize_script = ( + "import fcntl, struct, termios, time; " + "time.sleep(0.1); " + "rows, cols, *_ = struct.unpack('HHHH', " + "fcntl.ioctl(0, termios.TIOCGWINSZ, b'\\0' * 8)); " + "print(cols); print(rows)" + ) + bridge = PtyBridge.spawn( + [sys.executable, "-c", winsize_script], + cols=80, + rows=24, + ) + try: + # Must not raise struct.error. + bridge.resize(cols=131072, rows=1) + output = _read_until(bridge, b"\n", timeout=5.0) + # Width clamped to the sane maximum (2000), height floored to 1. + assert b"2000" in output + finally: + bridge.close() + + +@skip_on_windows +class TestClampDimension: + def test_clamps_above_max(self): + from hermes_cli.pty_bridge import _MAX_COLS, _MAX_ROWS, _clamp_dimension + + assert _clamp_dimension(131072, _MAX_COLS) == _MAX_COLS + assert _clamp_dimension(131072, _MAX_ROWS) == _MAX_ROWS + + def test_floors_at_one(self): + from hermes_cli.pty_bridge import _MAX_COLS, _clamp_dimension + + assert _clamp_dimension(0, _MAX_COLS) == 1 + assert _clamp_dimension(-5, _MAX_COLS) == 1 + + def test_passes_through_sane_values(self): + from hermes_cli.pty_bridge import _MAX_COLS, _clamp_dimension + + assert _clamp_dimension(80, _MAX_COLS) == 80 + assert _clamp_dimension(2000, _MAX_COLS) == 2000 + + def test_non_numeric_falls_back_to_min(self): + from hermes_cli.pty_bridge import _MAX_COLS, _clamp_dimension + + assert _clamp_dimension(None, _MAX_COLS) == 1 # type: ignore[arg-type] + assert _clamp_dimension(float("nan"), _MAX_COLS) == 1 # type: ignore[arg-type] + assert _clamp_dimension(float("inf"), _MAX_COLS) == 1 # type: ignore[arg-type] + + def test_clamped_values_pack_as_unsigned_short(self): + # The whole point: clamped output must never raise struct.error. + import struct as _struct + + from hermes_cli.pty_bridge import _MAX_COLS, _MAX_ROWS, _clamp_dimension + + cols = _clamp_dimension(131072, _MAX_COLS) + rows = _clamp_dimension(1, _MAX_ROWS) + # Should not raise. + _struct.pack("HHHH", rows, cols, 0, 0) + @skip_on_windows class TestPtyBridgeClose: diff --git a/tests/hermes_cli/test_web_server.py b/tests/hermes_cli/test_web_server.py index 747954633..cac40c800 100644 --- a/tests/hermes_cli/test_web_server.py +++ b/tests/hermes_cli/test_web_server.py @@ -3535,66 +3535,62 @@ class TestPtyWebSocket: assert "channel=abc-123" in url assert "token=" in url - def test_pub_broadcasts_to_events_subscribers(self, monkeypatch): - """Frame written to /api/pub is rebroadcast verbatim to every - /api/events subscriber on the same channel.""" - import time - from urllib.parse import urlencode + def test_pub_broadcasts_to_events_subscribers(self): + """A frame handed to _broadcast_event is sent verbatim to every + subscriber registered on that channel — and not to subscribers on + other channels. + + This drives the broadcast unit directly under asyncio rather than + round-tripping through Starlette's TestClient WebSocket portal. The + portal version was flaky under heavy parallel CI load: the broadcast + had to traverse two nested threaded portals within a 10s wall-clock + budget, and a starved ASGI thread occasionally blew that budget even + though the server logic was correct. Testing _broadcast_event with + fake subscribers removes the scheduling surface entirely while + asserting the exact fan-out contract. + """ + import asyncio from hermes_cli import web_server as ws_mod - qs = urlencode({"token": self.token, "channel": "broadcast-test"}) - pub_path = f"/api/pub?{qs}" - sub_path = f"/api/events?{qs}" + class _FakeSub: + def __init__(self): + self.sent: list[str] = [] - with self.client.websocket_connect(sub_path) as sub: - # Wait for the subscriber to be registered on the server side. - # websocket_connect returns when ws.accept() completes, but the - # server adds us to ``_event_channels`` in a follow-up await, - # so a publish immediately after connect can race ahead of the - # subscriber registration and the message is dropped. - deadline = time.monotonic() + 5.0 - while time.monotonic() < deadline: - if ws_mod.app.state.event_channels.get("broadcast-test"): - break - time.sleep(0.01) - else: - raise AssertionError( - "subscriber did not register on channel within 5s" + async def send_text(self, payload: str) -> None: + self.sent.append(payload) + + app = ws_mod.app + + async def _run(): + sub_a1 = _FakeSub() + sub_a2 = _FakeSub() + sub_other = _FakeSub() + frame = '{"type":"tool.start","payload":{"tool_id":"t1"}}' + + event_channels, event_lock = ws_mod._get_event_state(app) + # Register two subscribers on the target channel and one on a + # different channel, exactly as the /api/events handler does. + async with event_lock: + event_channels.setdefault("broadcast-test", set()).update( + {sub_a1, sub_a2} ) + event_channels.setdefault("other-channel", set()).add(sub_other) + try: + await ws_mod._broadcast_event(app, "broadcast-test", frame) + finally: + async with event_lock: + event_channels.pop("broadcast-test", None) + event_channels.pop("other-channel", None) - with self.client.websocket_connect(pub_path) as pub: - pub.send_text('{"type":"tool.start","payload":{"tool_id":"t1"}}') - # Yield control so the server-side broadcast handler can - # process the frame. TestClient runs the ASGI app in a - # background thread; a small sleep gives that thread time - # to call _broadcast_event before we start blocking on - # receive_text(). Without this, under heavy CI load the - # receive can race the broadcast and hang until - # pytest-timeout kills us. - import queue, threading - recv_q: queue.Queue = queue.Queue() + return sub_a1, sub_a2, sub_other, frame - def _recv(): - try: - recv_q.put(sub.receive_text()) - except Exception as exc: - recv_q.put(exc) + sub_a1, sub_a2, sub_other, frame = asyncio.run(_run()) - t = threading.Thread(target=_recv, daemon=True) - t.start() - try: - received = recv_q.get(timeout=10.0) - except queue.Empty: - raise AssertionError( - "broadcast not received within 10s — server likely " - "dropped the frame silently (see _broadcast_event " - "except Exception: pass)" - ) - if isinstance(received, Exception): - raise received - - assert "tool.start" in received - assert '"tool_id":"t1"' in received + # Every subscriber on the channel got the frame verbatim, exactly once. + assert sub_a1.sent == [frame] + assert sub_a2.sent == [frame] + # A subscriber on a different channel got nothing. + assert sub_other.sent == [] def test_events_rejects_missing_channel(self): from starlette.websockets import WebSocketDisconnect