fix(dashboard): clamp PTY resize dimensions for WSL2 winsize garbage (#38200)
* fix(dashboard): clamp PTY resize dimensions for WSL2 winsize garbage WSL2 reports columns=131072, rows=1 from a broken winsize probe. The dashboard /chat tab forwards xterm.js dimensions through PtyBridge.resize(), which packs them as unsigned short via struct.pack. 131072 > 65535 raised struct.error — uncaught (only OSError was handled) — breaking the resize path and leaving the TUI laid out for a one-row, absurdly-wide screen, which surfaces as blank/disappearing text. Clamp cols/rows to a sane [1, 2000]x[1, 1000] range before packing. Non-finite/non-integer probes fall back to the minimum so nothing can reach struct.pack and raise. * test(dashboard): de-flake pub/events broadcast test test_pub_broadcasts_to_events_subscribers round-tripped a frame through two nested Starlette TestClient WebSocket portals within a 10s wall-clock budget. Under heavy parallel CI load a starved ASGI thread occasionally blew that budget even though the server logic is correct, producing intermittent 'broadcast not received within 10s' failures. Drive _broadcast_event directly under asyncio with fake subscribers instead. Same fan-out contract (verbatim delivery to every subscriber on the channel, nothing to other channels), zero scheduling surface. Runs in ~0.3s, deterministic across 10 consecutive runs.
This commit is contained in:
@ -50,6 +50,33 @@ except ImportError: # pragma: no cover - dev env without ptyprocess
|
||||
__all__ = ["PtyBridge", "PtyUnavailableError"]
|
||||
|
||||
|
||||
# ``struct winsize`` packs rows/cols as unsigned short (0..65535). We clamp
|
||||
# well below that ceiling: real terminals never exceed a couple thousand
|
||||
# columns, and a value above this is a broken probe (WSL2 reports
|
||||
# columns=131072) rather than a genuine ultrawide. Lower bound is 1 — a
|
||||
# zero/negative dimension is the classic "no size yet" signal.
|
||||
_MIN_DIMENSION = 1
|
||||
_MAX_COLS = 2000
|
||||
_MAX_ROWS = 1000
|
||||
|
||||
|
||||
def _clamp_dimension(value: int, maximum: int) -> int:
|
||||
"""Clamp a reported terminal dimension into ``[_MIN_DIMENSION, maximum]``.
|
||||
|
||||
Non-integer / non-finite values fall back to ``_MIN_DIMENSION`` so a bad
|
||||
probe can never reach ``struct.pack`` and raise ``struct.error``.
|
||||
"""
|
||||
try:
|
||||
n = int(value)
|
||||
except (TypeError, ValueError, OverflowError):
|
||||
return _MIN_DIMENSION
|
||||
if n < _MIN_DIMENSION:
|
||||
return _MIN_DIMENSION
|
||||
if n > maximum:
|
||||
return maximum
|
||||
return n
|
||||
|
||||
|
||||
class PtyUnavailableError(RuntimeError):
|
||||
"""Raised when a PTY cannot be created on this platform.
|
||||
|
||||
@ -189,11 +216,23 @@ class PtyBridge:
|
||||
view = view[n:]
|
||||
|
||||
def resize(self, cols: int, rows: int) -> None:
|
||||
"""Forward a terminal resize to the child via ``TIOCSWINSZ``."""
|
||||
"""Forward a terminal resize to the child via ``TIOCSWINSZ``.
|
||||
|
||||
Dimensions are clamped to a sane range first. Some hosts report
|
||||
garbage window sizes — the motivating case is WSL2, where xterm.js
|
||||
in the dashboard ``/chat`` tab can pick up ``columns=131072,
|
||||
rows=1`` from a broken winsize probe. ``struct winsize`` packs each
|
||||
field as an unsigned short (max 65535), so an unclamped 131072 would
|
||||
raise ``struct.error`` (not ``OSError``) and break the resize path,
|
||||
leaving the TUI laid out for a one-row / absurdly-wide screen —
|
||||
which is what shows up as blank / disappearing text.
|
||||
"""
|
||||
if self._closed:
|
||||
return
|
||||
cols = _clamp_dimension(cols, _MAX_COLS)
|
||||
rows = _clamp_dimension(rows, _MAX_ROWS)
|
||||
# struct winsize: rows, cols, xpixel, ypixel (all unsigned short)
|
||||
winsize = struct.pack("HHHH", max(1, rows), max(1, cols), 0, 0)
|
||||
winsize = struct.pack("HHHH", rows, cols, 0, 0)
|
||||
try:
|
||||
fcntl.ioctl(self._fd, termios.TIOCSWINSZ, winsize)
|
||||
except OSError:
|
||||
|
||||
@ -120,6 +120,72 @@ class TestPtyBridgeResize:
|
||||
finally:
|
||||
bridge.close()
|
||||
|
||||
def test_resize_clamps_wsl_garbage_dimensions(self):
|
||||
# WSL2 reports columns=131072, rows=1 from a broken winsize probe.
|
||||
# 131072 > 65535 (unsigned short max) used to raise struct.error in
|
||||
# resize() — uncaught, since only OSError was handled — and broke the
|
||||
# dashboard /chat resize path (blank/disappearing text). The clamp
|
||||
# must coerce the width down to the sane max and never raise.
|
||||
winsize_script = (
|
||||
"import fcntl, struct, termios, time; "
|
||||
"time.sleep(0.1); "
|
||||
"rows, cols, *_ = struct.unpack('HHHH', "
|
||||
"fcntl.ioctl(0, termios.TIOCGWINSZ, b'\\0' * 8)); "
|
||||
"print(cols); print(rows)"
|
||||
)
|
||||
bridge = PtyBridge.spawn(
|
||||
[sys.executable, "-c", winsize_script],
|
||||
cols=80,
|
||||
rows=24,
|
||||
)
|
||||
try:
|
||||
# Must not raise struct.error.
|
||||
bridge.resize(cols=131072, rows=1)
|
||||
output = _read_until(bridge, b"\n", timeout=5.0)
|
||||
# Width clamped to the sane maximum (2000), height floored to 1.
|
||||
assert b"2000" in output
|
||||
finally:
|
||||
bridge.close()
|
||||
|
||||
|
||||
@skip_on_windows
|
||||
class TestClampDimension:
|
||||
def test_clamps_above_max(self):
|
||||
from hermes_cli.pty_bridge import _MAX_COLS, _MAX_ROWS, _clamp_dimension
|
||||
|
||||
assert _clamp_dimension(131072, _MAX_COLS) == _MAX_COLS
|
||||
assert _clamp_dimension(131072, _MAX_ROWS) == _MAX_ROWS
|
||||
|
||||
def test_floors_at_one(self):
|
||||
from hermes_cli.pty_bridge import _MAX_COLS, _clamp_dimension
|
||||
|
||||
assert _clamp_dimension(0, _MAX_COLS) == 1
|
||||
assert _clamp_dimension(-5, _MAX_COLS) == 1
|
||||
|
||||
def test_passes_through_sane_values(self):
|
||||
from hermes_cli.pty_bridge import _MAX_COLS, _clamp_dimension
|
||||
|
||||
assert _clamp_dimension(80, _MAX_COLS) == 80
|
||||
assert _clamp_dimension(2000, _MAX_COLS) == 2000
|
||||
|
||||
def test_non_numeric_falls_back_to_min(self):
|
||||
from hermes_cli.pty_bridge import _MAX_COLS, _clamp_dimension
|
||||
|
||||
assert _clamp_dimension(None, _MAX_COLS) == 1 # type: ignore[arg-type]
|
||||
assert _clamp_dimension(float("nan"), _MAX_COLS) == 1 # type: ignore[arg-type]
|
||||
assert _clamp_dimension(float("inf"), _MAX_COLS) == 1 # type: ignore[arg-type]
|
||||
|
||||
def test_clamped_values_pack_as_unsigned_short(self):
|
||||
# The whole point: clamped output must never raise struct.error.
|
||||
import struct as _struct
|
||||
|
||||
from hermes_cli.pty_bridge import _MAX_COLS, _MAX_ROWS, _clamp_dimension
|
||||
|
||||
cols = _clamp_dimension(131072, _MAX_COLS)
|
||||
rows = _clamp_dimension(1, _MAX_ROWS)
|
||||
# Should not raise.
|
||||
_struct.pack("HHHH", rows, cols, 0, 0)
|
||||
|
||||
|
||||
@skip_on_windows
|
||||
class TestPtyBridgeClose:
|
||||
|
||||
@ -3535,66 +3535,62 @@ class TestPtyWebSocket:
|
||||
assert "channel=abc-123" in url
|
||||
assert "token=" in url
|
||||
|
||||
def test_pub_broadcasts_to_events_subscribers(self, monkeypatch):
|
||||
"""Frame written to /api/pub is rebroadcast verbatim to every
|
||||
/api/events subscriber on the same channel."""
|
||||
import time
|
||||
from urllib.parse import urlencode
|
||||
def test_pub_broadcasts_to_events_subscribers(self):
|
||||
"""A frame handed to _broadcast_event is sent verbatim to every
|
||||
subscriber registered on that channel — and not to subscribers on
|
||||
other channels.
|
||||
|
||||
This drives the broadcast unit directly under asyncio rather than
|
||||
round-tripping through Starlette's TestClient WebSocket portal. The
|
||||
portal version was flaky under heavy parallel CI load: the broadcast
|
||||
had to traverse two nested threaded portals within a 10s wall-clock
|
||||
budget, and a starved ASGI thread occasionally blew that budget even
|
||||
though the server logic was correct. Testing _broadcast_event with
|
||||
fake subscribers removes the scheduling surface entirely while
|
||||
asserting the exact fan-out contract.
|
||||
"""
|
||||
import asyncio
|
||||
from hermes_cli import web_server as ws_mod
|
||||
|
||||
qs = urlencode({"token": self.token, "channel": "broadcast-test"})
|
||||
pub_path = f"/api/pub?{qs}"
|
||||
sub_path = f"/api/events?{qs}"
|
||||
class _FakeSub:
|
||||
def __init__(self):
|
||||
self.sent: list[str] = []
|
||||
|
||||
with self.client.websocket_connect(sub_path) as sub:
|
||||
# Wait for the subscriber to be registered on the server side.
|
||||
# websocket_connect returns when ws.accept() completes, but the
|
||||
# server adds us to ``_event_channels`` in a follow-up await,
|
||||
# so a publish immediately after connect can race ahead of the
|
||||
# subscriber registration and the message is dropped.
|
||||
deadline = time.monotonic() + 5.0
|
||||
while time.monotonic() < deadline:
|
||||
if ws_mod.app.state.event_channels.get("broadcast-test"):
|
||||
break
|
||||
time.sleep(0.01)
|
||||
else:
|
||||
raise AssertionError(
|
||||
"subscriber did not register on channel within 5s"
|
||||
async def send_text(self, payload: str) -> None:
|
||||
self.sent.append(payload)
|
||||
|
||||
app = ws_mod.app
|
||||
|
||||
async def _run():
|
||||
sub_a1 = _FakeSub()
|
||||
sub_a2 = _FakeSub()
|
||||
sub_other = _FakeSub()
|
||||
frame = '{"type":"tool.start","payload":{"tool_id":"t1"}}'
|
||||
|
||||
event_channels, event_lock = ws_mod._get_event_state(app)
|
||||
# Register two subscribers on the target channel and one on a
|
||||
# different channel, exactly as the /api/events handler does.
|
||||
async with event_lock:
|
||||
event_channels.setdefault("broadcast-test", set()).update(
|
||||
{sub_a1, sub_a2}
|
||||
)
|
||||
event_channels.setdefault("other-channel", set()).add(sub_other)
|
||||
try:
|
||||
await ws_mod._broadcast_event(app, "broadcast-test", frame)
|
||||
finally:
|
||||
async with event_lock:
|
||||
event_channels.pop("broadcast-test", None)
|
||||
event_channels.pop("other-channel", None)
|
||||
|
||||
with self.client.websocket_connect(pub_path) as pub:
|
||||
pub.send_text('{"type":"tool.start","payload":{"tool_id":"t1"}}')
|
||||
# Yield control so the server-side broadcast handler can
|
||||
# process the frame. TestClient runs the ASGI app in a
|
||||
# background thread; a small sleep gives that thread time
|
||||
# to call _broadcast_event before we start blocking on
|
||||
# receive_text(). Without this, under heavy CI load the
|
||||
# receive can race the broadcast and hang until
|
||||
# pytest-timeout kills us.
|
||||
import queue, threading
|
||||
recv_q: queue.Queue = queue.Queue()
|
||||
return sub_a1, sub_a2, sub_other, frame
|
||||
|
||||
def _recv():
|
||||
try:
|
||||
recv_q.put(sub.receive_text())
|
||||
except Exception as exc:
|
||||
recv_q.put(exc)
|
||||
sub_a1, sub_a2, sub_other, frame = asyncio.run(_run())
|
||||
|
||||
t = threading.Thread(target=_recv, daemon=True)
|
||||
t.start()
|
||||
try:
|
||||
received = recv_q.get(timeout=10.0)
|
||||
except queue.Empty:
|
||||
raise AssertionError(
|
||||
"broadcast not received within 10s — server likely "
|
||||
"dropped the frame silently (see _broadcast_event "
|
||||
"except Exception: pass)"
|
||||
)
|
||||
if isinstance(received, Exception):
|
||||
raise received
|
||||
|
||||
assert "tool.start" in received
|
||||
assert '"tool_id":"t1"' in received
|
||||
# Every subscriber on the channel got the frame verbatim, exactly once.
|
||||
assert sub_a1.sent == [frame]
|
||||
assert sub_a2.sent == [frame]
|
||||
# A subscriber on a different channel got nothing.
|
||||
assert sub_other.sent == []
|
||||
|
||||
def test_events_rejects_missing_channel(self):
|
||||
from starlette.websockets import WebSocketDisconnect
|
||||
|
||||
Reference in New Issue
Block a user