fix(dashboard): clamp PTY resize dimensions for WSL2 winsize garbage (#38200)

* fix(dashboard): clamp PTY resize dimensions for WSL2 winsize garbage

WSL2 reports columns=131072, rows=1 from a broken winsize probe. The
dashboard /chat tab forwards xterm.js dimensions through PtyBridge.resize(),
which packs them as unsigned short via struct.pack. 131072 > 65535 raised
struct.error — uncaught (only OSError was handled) — breaking the resize
path and leaving the TUI laid out for a one-row, absurdly-wide screen, which
surfaces as blank/disappearing text.

Clamp cols/rows to a sane [1, 2000]x[1, 1000] range before packing.
Non-finite/non-integer probes fall back to the minimum so nothing can reach
struct.pack and raise.

* test(dashboard): de-flake pub/events broadcast test

test_pub_broadcasts_to_events_subscribers round-tripped a frame through
two nested Starlette TestClient WebSocket portals within a 10s wall-clock
budget. Under heavy parallel CI load a starved ASGI thread occasionally
blew that budget even though the server logic is correct, producing
intermittent 'broadcast not received within 10s' failures.

Drive _broadcast_event directly under asyncio with fake subscribers
instead. Same fan-out contract (verbatim delivery to every subscriber on
the channel, nothing to other channels), zero scheduling surface. Runs in
~0.3s, deterministic across 10 consecutive runs.
This commit is contained in:
Teknium
2026-06-03 09:00:16 -07:00
committed by GitHub
parent ecac659d7d
commit 9666305630
3 changed files with 156 additions and 55 deletions

View File

@ -50,6 +50,33 @@ except ImportError: # pragma: no cover - dev env without ptyprocess
__all__ = ["PtyBridge", "PtyUnavailableError"]
# ``struct winsize`` packs rows/cols as unsigned short (0..65535). We clamp
# well below that ceiling: real terminals never exceed a couple thousand
# columns, and a value above this is a broken probe (WSL2 reports
# columns=131072) rather than a genuine ultrawide. Lower bound is 1 — a
# zero/negative dimension is the classic "no size yet" signal.
_MIN_DIMENSION = 1
_MAX_COLS = 2000
_MAX_ROWS = 1000
def _clamp_dimension(value: int, maximum: int) -> int:
"""Clamp a reported terminal dimension into ``[_MIN_DIMENSION, maximum]``.
Non-integer / non-finite values fall back to ``_MIN_DIMENSION`` so a bad
probe can never reach ``struct.pack`` and raise ``struct.error``.
"""
try:
n = int(value)
except (TypeError, ValueError, OverflowError):
return _MIN_DIMENSION
if n < _MIN_DIMENSION:
return _MIN_DIMENSION
if n > maximum:
return maximum
return n
class PtyUnavailableError(RuntimeError):
"""Raised when a PTY cannot be created on this platform.
@ -189,11 +216,23 @@ class PtyBridge:
view = view[n:]
def resize(self, cols: int, rows: int) -> None:
"""Forward a terminal resize to the child via ``TIOCSWINSZ``."""
"""Forward a terminal resize to the child via ``TIOCSWINSZ``.
Dimensions are clamped to a sane range first. Some hosts report
garbage window sizes — the motivating case is WSL2, where xterm.js
in the dashboard ``/chat`` tab can pick up ``columns=131072,
rows=1`` from a broken winsize probe. ``struct winsize`` packs each
field as an unsigned short (max 65535), so an unclamped 131072 would
raise ``struct.error`` (not ``OSError``) and break the resize path,
leaving the TUI laid out for a one-row / absurdly-wide screen —
which is what shows up as blank / disappearing text.
"""
if self._closed:
return
cols = _clamp_dimension(cols, _MAX_COLS)
rows = _clamp_dimension(rows, _MAX_ROWS)
# struct winsize: rows, cols, xpixel, ypixel (all unsigned short)
winsize = struct.pack("HHHH", max(1, rows), max(1, cols), 0, 0)
winsize = struct.pack("HHHH", rows, cols, 0, 0)
try:
fcntl.ioctl(self._fd, termios.TIOCSWINSZ, winsize)
except OSError:

View File

@ -120,6 +120,72 @@ class TestPtyBridgeResize:
finally:
bridge.close()
def test_resize_clamps_wsl_garbage_dimensions(self):
# WSL2 reports columns=131072, rows=1 from a broken winsize probe.
# 131072 > 65535 (unsigned short max) used to raise struct.error in
# resize() — uncaught, since only OSError was handled — and broke the
# dashboard /chat resize path (blank/disappearing text). The clamp
# must coerce the width down to the sane max and never raise.
winsize_script = (
"import fcntl, struct, termios, time; "
"time.sleep(0.1); "
"rows, cols, *_ = struct.unpack('HHHH', "
"fcntl.ioctl(0, termios.TIOCGWINSZ, b'\\0' * 8)); "
"print(cols); print(rows)"
)
bridge = PtyBridge.spawn(
[sys.executable, "-c", winsize_script],
cols=80,
rows=24,
)
try:
# Must not raise struct.error.
bridge.resize(cols=131072, rows=1)
output = _read_until(bridge, b"\n", timeout=5.0)
# Width clamped to the sane maximum (2000), height floored to 1.
assert b"2000" in output
finally:
bridge.close()
@skip_on_windows
class TestClampDimension:
def test_clamps_above_max(self):
from hermes_cli.pty_bridge import _MAX_COLS, _MAX_ROWS, _clamp_dimension
assert _clamp_dimension(131072, _MAX_COLS) == _MAX_COLS
assert _clamp_dimension(131072, _MAX_ROWS) == _MAX_ROWS
def test_floors_at_one(self):
from hermes_cli.pty_bridge import _MAX_COLS, _clamp_dimension
assert _clamp_dimension(0, _MAX_COLS) == 1
assert _clamp_dimension(-5, _MAX_COLS) == 1
def test_passes_through_sane_values(self):
from hermes_cli.pty_bridge import _MAX_COLS, _clamp_dimension
assert _clamp_dimension(80, _MAX_COLS) == 80
assert _clamp_dimension(2000, _MAX_COLS) == 2000
def test_non_numeric_falls_back_to_min(self):
from hermes_cli.pty_bridge import _MAX_COLS, _clamp_dimension
assert _clamp_dimension(None, _MAX_COLS) == 1 # type: ignore[arg-type]
assert _clamp_dimension(float("nan"), _MAX_COLS) == 1 # type: ignore[arg-type]
assert _clamp_dimension(float("inf"), _MAX_COLS) == 1 # type: ignore[arg-type]
def test_clamped_values_pack_as_unsigned_short(self):
# The whole point: clamped output must never raise struct.error.
import struct as _struct
from hermes_cli.pty_bridge import _MAX_COLS, _MAX_ROWS, _clamp_dimension
cols = _clamp_dimension(131072, _MAX_COLS)
rows = _clamp_dimension(1, _MAX_ROWS)
# Should not raise.
_struct.pack("HHHH", rows, cols, 0, 0)
@skip_on_windows
class TestPtyBridgeClose:

View File

@ -3535,66 +3535,62 @@ class TestPtyWebSocket:
assert "channel=abc-123" in url
assert "token=" in url
def test_pub_broadcasts_to_events_subscribers(self, monkeypatch):
"""Frame written to /api/pub is rebroadcast verbatim to every
/api/events subscriber on the same channel."""
import time
from urllib.parse import urlencode
def test_pub_broadcasts_to_events_subscribers(self):
"""A frame handed to _broadcast_event is sent verbatim to every
subscriber registered on that channel — and not to subscribers on
other channels.
This drives the broadcast unit directly under asyncio rather than
round-tripping through Starlette's TestClient WebSocket portal. The
portal version was flaky under heavy parallel CI load: the broadcast
had to traverse two nested threaded portals within a 10s wall-clock
budget, and a starved ASGI thread occasionally blew that budget even
though the server logic was correct. Testing _broadcast_event with
fake subscribers removes the scheduling surface entirely while
asserting the exact fan-out contract.
"""
import asyncio
from hermes_cli import web_server as ws_mod
qs = urlencode({"token": self.token, "channel": "broadcast-test"})
pub_path = f"/api/pub?{qs}"
sub_path = f"/api/events?{qs}"
class _FakeSub:
def __init__(self):
self.sent: list[str] = []
with self.client.websocket_connect(sub_path) as sub:
# Wait for the subscriber to be registered on the server side.
# websocket_connect returns when ws.accept() completes, but the
# server adds us to ``_event_channels`` in a follow-up await,
# so a publish immediately after connect can race ahead of the
# subscriber registration and the message is dropped.
deadline = time.monotonic() + 5.0
while time.monotonic() < deadline:
if ws_mod.app.state.event_channels.get("broadcast-test"):
break
time.sleep(0.01)
else:
raise AssertionError(
"subscriber did not register on channel within 5s"
async def send_text(self, payload: str) -> None:
self.sent.append(payload)
app = ws_mod.app
async def _run():
sub_a1 = _FakeSub()
sub_a2 = _FakeSub()
sub_other = _FakeSub()
frame = '{"type":"tool.start","payload":{"tool_id":"t1"}}'
event_channels, event_lock = ws_mod._get_event_state(app)
# Register two subscribers on the target channel and one on a
# different channel, exactly as the /api/events handler does.
async with event_lock:
event_channels.setdefault("broadcast-test", set()).update(
{sub_a1, sub_a2}
)
event_channels.setdefault("other-channel", set()).add(sub_other)
try:
await ws_mod._broadcast_event(app, "broadcast-test", frame)
finally:
async with event_lock:
event_channels.pop("broadcast-test", None)
event_channels.pop("other-channel", None)
with self.client.websocket_connect(pub_path) as pub:
pub.send_text('{"type":"tool.start","payload":{"tool_id":"t1"}}')
# Yield control so the server-side broadcast handler can
# process the frame. TestClient runs the ASGI app in a
# background thread; a small sleep gives that thread time
# to call _broadcast_event before we start blocking on
# receive_text(). Without this, under heavy CI load the
# receive can race the broadcast and hang until
# pytest-timeout kills us.
import queue, threading
recv_q: queue.Queue = queue.Queue()
return sub_a1, sub_a2, sub_other, frame
def _recv():
try:
recv_q.put(sub.receive_text())
except Exception as exc:
recv_q.put(exc)
sub_a1, sub_a2, sub_other, frame = asyncio.run(_run())
t = threading.Thread(target=_recv, daemon=True)
t.start()
try:
received = recv_q.get(timeout=10.0)
except queue.Empty:
raise AssertionError(
"broadcast not received within 10s — server likely "
"dropped the frame silently (see _broadcast_event "
"except Exception: pass)"
)
if isinstance(received, Exception):
raise received
assert "tool.start" in received
assert '"tool_id":"t1"' in received
# Every subscriber on the channel got the frame verbatim, exactly once.
assert sub_a1.sent == [frame]
assert sub_a2.sent == [frame]
# A subscriber on a different channel got nothing.
assert sub_other.sent == []
def test_events_rejects_missing_channel(self):
from starlette.websockets import WebSocketDisconnect