From fd1ec8033ddac8f7e3a02b21db75f1b92ebc6ac6 Mon Sep 17 00:00:00 2001 From: Ben Date: Wed, 3 Jun 2026 14:48:58 +1000 Subject: [PATCH] fix(dashboard): authenticate server-spawned PTY child WS with a process-internal credential MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The embedded-TUI PTY child attaches to two server-internal WebSockets: /api/ws (its primary JSON-RPC gateway backend) and /api/pub (the event sidecar). Both URLs are built server-side in web_server.py and handed to the child via its environment. In OAuth-gated mode (auth_required=true, every hosted Fly agent), _ws_auth_ok unconditionally rejects the legacy ?token=<_SESSION_TOKEN> path — a leaked session token must not grant WS access once the gate is engaged. But _build_gateway_ws_url() still only emitted ?token=, with no gated-mode branch (its sibling _build_sidecar_url had been given a ticket branch; the gateway-url builder was missed). So the TUI child's /api/ws upgrade was rejected 4401 -> 'gateway websocket connection failed' -> 'gateway startup timeout', leaving the embedded chat unusable on every gated deployment. A single-use 30s browser ticket is the wrong shape for this link: the child reads its attach URL once at startup and reuses it on every reconnect, and on a slow cold boot it may not dial within the TTL. (_build_sidecar_url's own docstring already flagged this fragility.) Fix: add a process-lifetime, multi-use internal credential to dashboard_auth.ws_tickets (internal_ws_credential / consume_internal_credential), minted once per process and NEVER injected into the SPA — it only leaves the process via a spawned child's env, so browser-side XSS can't read it, and a leak grants no more than a ticket already does. _ws_auth_ok accepts it via ?internal= in gated mode only. Both _build_gateway_ws_url and _build_sidecar_url now use it, so the child can reconnect both sockets. Loopback / --insecure behavior is unchanged (still ?token=). Needs review: touches _ws_auth_ok + dashboard_auth (core auth surface). --- hermes_cli/dashboard_auth/ws_tickets.py | 92 ++++++++++++++++-- hermes_cli/web_server.py | 91 +++++++++++++----- .../hermes_cli/test_dashboard_auth_ws_auth.py | 95 ++++++++++++++++--- 3 files changed, 233 insertions(+), 45 deletions(-) diff --git a/hermes_cli/dashboard_auth/ws_tickets.py b/hermes_cli/dashboard_auth/ws_tickets.py index 6ebad217e..a7e46dccf 100644 --- a/hermes_cli/dashboard_auth/ws_tickets.py +++ b/hermes_cli/dashboard_auth/ws_tickets.py @@ -1,16 +1,32 @@ -"""Short-lived single-use tickets for WS-upgrade auth in gated mode. +"""WS-upgrade auth credentials for gated mode. Browsers cannot set ``Authorization`` on a WebSocket upgrade. In loopback mode the legacy ``?token=<_SESSION_TOKEN>`` query param works because the token is injected into the SPA bundle. In gated mode there is no injected -token — the SPA gets a fresh ticket via the authenticated REST endpoint -``POST /api/auth/ws-ticket`` and passes that as ``?ticket=`` on the -WS upgrade. +token — so this module provides two credential shapes: -Tickets are single-use, TTL = 30 seconds. In-memory; the dashboard is a -single process so no distributed coordination is needed. The module -exposes a small functional API rather than a class so tests can patch -``time.time`` cleanly. +1. **Single-use browser tickets** (``mint_ticket`` / ``consume_ticket``). + The SPA gets a fresh ticket via the authenticated REST endpoint + ``POST /api/auth/ws-ticket`` and passes it as ``?ticket=`` on the WS + upgrade. Single-use, TTL = 30 seconds — a leaked ticket is uninteresting. + +2. **A process-lifetime internal credential** (``internal_ws_credential`` / + ``consume_internal_credential``). This authenticates *server-spawned* + WS clients — specifically the embedded-TUI PTY child, which attaches to + ``/api/ws`` (JSON-RPC gateway) and ``/api/pub`` (event sidecar) over + loopback. A single-use 30s ticket is the wrong shape for that link: the + child reads its attach URL once at startup and **reuses it on every + reconnect**, and on a slow cold boot the child may not dial within 30s. + The internal credential is minted once per process, never expires, is + multi-use, and — critically — is **never injected into any HTML/SPA**: + it only ever leaves the process via the spawned child's environment, so + browser-side XSS cannot read it. A leaked internal credential grants no + more than a single-use ticket already does (the same two internal WS + endpoints), and the same Origin / host guards still apply downstream. + +In-memory; the dashboard is a single process so no distributed coordination +is needed. The module exposes a small functional API rather than a class so +tests can patch ``time.time`` cleanly. """ from __future__ import annotations @@ -18,7 +34,7 @@ from __future__ import annotations import secrets import threading import time -from typing import Any, Dict, Tuple +from typing import Any, Dict, Optional, Tuple #: Time-to-live for newly-minted tickets in seconds. 30 s is long enough #: that the SPA can call ``getWsTicket()`` and immediately open the WS, @@ -28,6 +44,16 @@ TTL_SECONDS = 30 _lock = threading.Lock() _tickets: Dict[str, Tuple[int, Dict[str, Any]]] = {} # ticket -> (expires_at, info) +#: The process-lifetime internal credential (see module docstring). Lazily +#: minted on first ``internal_ws_credential()`` call and stable for the life +#: of the process. Guarded by ``_lock``. +_internal_credential: Optional[str] = None + +#: Identity recorded for connections that authenticate via the internal +#: credential, so audit logs distinguish them from browser-initiated tickets. +INTERNAL_USER_ID = "server-internal" +INTERNAL_PROVIDER = "server-internal" + class TicketInvalid(Exception): """Ticket missing, expired, or already consumed.""" @@ -81,7 +107,53 @@ def _gc_expired_locked() -> None: _tickets.pop(t, None) +def internal_ws_credential() -> str: + """Return the process-lifetime internal WS credential, minting it once. + + Used by the server to authenticate WS clients it spawns itself (the + embedded-TUI PTY child). The value is stable for the life of the process, + multi-use, and never expires — so a server-spawned child can reconnect + its ``/api/ws`` / ``/api/pub`` sockets indefinitely without re-minting. + + The credential is never injected into the SPA HTML or returned over any + REST endpoint; it is only ever passed to a child process via its + environment. See the module docstring for the threat-model rationale. + """ + global _internal_credential + with _lock: + if _internal_credential is None: + _internal_credential = secrets.token_urlsafe(32) + return _internal_credential + + +def consume_internal_credential(value: str) -> Dict[str, Any]: + """Validate an internal credential. Raises :class:`TicketInvalid` on mismatch. + + Unlike :func:`consume_ticket` this is **not** single-use — the value is + not removed on success, so a server-spawned child can present it on every + (re)connect. Returns the fixed server-internal identity ``info`` dict so + the WS handler can carry it into its session log, mirroring the shape + ``consume_ticket`` returns. + + A constant-time compare against the (lazily-minted) credential avoids + leaking length / prefix information on mismatch. If no internal + credential has been minted yet, any value is rejected. + """ + with _lock: + expected = _internal_credential + if not value or expected is None: + raise TicketInvalid("no internal credential") + if not secrets.compare_digest(value.encode(), expected.encode()): + raise TicketInvalid("internal credential mismatch") + return { + "user_id": INTERNAL_USER_ID, + "provider": INTERNAL_PROVIDER, + } + + def _reset_for_tests() -> None: - """Test-only: drop all tickets.""" + """Test-only: drop all tickets and the internal credential.""" + global _internal_credential with _lock: _tickets.clear() + _internal_credential = None diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index 1d647b7ed..ea0cdd23f 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -6643,10 +6643,21 @@ def _ws_auth_ok(ws: "WebSocket") -> bool: Loopback / ``--insecure``: legacy ``?token=<_SESSION_TOKEN>`` query parameter, constant-time compared. - Gated (public bind, no ``--insecure``): ``?ticket=`` query - parameter consumed against the dashboard-auth ticket store. The legacy - token path is unconditionally rejected in this mode (the SPA bundle - isn't carrying the token any longer). + Gated (public bind, no ``--insecure``): one of two credentials — + + * ``?ticket=`` — a browser-minted, single-use, 30s-TTL ticket + consumed against the dashboard-auth ticket store. This is what the SPA + (and native clients) use. + * ``?internal=`` — the process-lifetime internal + credential, used only by WS clients the server spawns itself (the + embedded-TUI PTY child attaching to ``/api/ws`` and ``/api/pub``). It + is multi-use and never expires so the child can reconnect, and is never + injected into the SPA — see ``dashboard_auth.ws_tickets`` for the + threat model. + + The legacy ``?token=`` path is unconditionally rejected in gated mode + (the SPA bundle isn't carrying the token any longer, and a leaked + ``_SESSION_TOKEN`` must not grant WS access once the gate is engaged). Returns True if the WS should be accepted; callers close with the appropriate WS code (4401) on False. Audit-logs the rejection so @@ -6654,17 +6665,36 @@ def _ws_auth_ok(ws: "WebSocket") -> bool: """ auth_required = bool(getattr(app.state, "auth_required", False)) if auth_required: - ticket = ws.query_params.get("ticket", "") - if not ticket: - return False # Lazy import — keeps this function importable in test harnesses # that don't bring in the dashboard_auth layer. from hermes_cli.dashboard_auth.audit import AuditEvent, audit_log from hermes_cli.dashboard_auth.ws_tickets import ( TicketInvalid, + consume_internal_credential, consume_ticket, ) + # Server-spawned children (PTY child → /api/ws, /api/pub) present the + # multi-use internal credential rather than a single-use ticket, so + # they survive reconnects and slow cold boots. + internal = ws.query_params.get("internal", "") + if internal: + try: + consume_internal_credential(internal) + return True + except TicketInvalid as exc: + audit_log( + AuditEvent.WS_TICKET_REJECTED, + reason=f"internal: {exc}", + ip=(ws.client.host if ws.client else ""), + path=ws.url.path, + ) + return False + + ticket = ws.query_params.get("ticket", "") + if not ticket: + return False + try: consume_ticket(ticket) return True @@ -6740,7 +6770,16 @@ def _resolve_chat_argv( def _build_gateway_ws_url() -> Optional[str]: - """ws:// URL the PTY child should attach to for JSON-RPC gateway traffic.""" + """ws:// URL the PTY child should attach to for JSON-RPC gateway traffic. + + Loopback / ``--insecure``: ``?token=<_SESSION_TOKEN>``. + + Gated mode: the legacy token path is rejected by ``_ws_auth_ok``, so the + server-spawned PTY child authenticates with the process-lifetime internal + credential (``?internal=``). It must NOT use a single-use browser ticket: + the child reads this URL once at startup and reuses it on every reconnect, + and a 30s-TTL ticket can expire before a slow cold boot even dials. + """ host = getattr(app.state, "bound_host", None) port = getattr(app.state, "bound_port", None) @@ -6752,7 +6791,13 @@ def _build_gateway_ws_url() -> Optional[str]: if ":" in host and not host.startswith("[") else f"{host}:{port}" ) - qs = urllib.parse.urlencode({"token": _SESSION_TOKEN}) + + if getattr(app.state, "auth_required", False): + from hermes_cli.dashboard_auth.ws_tickets import internal_ws_credential + + qs = urllib.parse.urlencode({"internal": internal_ws_credential()}) + else: + qs = urllib.parse.urlencode({"token": _SESSION_TOKEN}) return f"ws://{netloc}/api/ws?{qs}" @@ -6762,16 +6807,14 @@ def _build_sidecar_url(channel: str) -> Optional[str]: Loopback / ``--insecure``: uses ``?token=<_SESSION_TOKEN>``. - Gated mode: mints a single-use ticket via the dashboard-auth ticket - store (server-side mint, no HTTP round trip — the PTY child is a - server-spawned process and we trust it). The ticket binds to the - pseudo-user ``"pty-sidecar"`` so audit logs can distinguish these from - browser-initiated tickets. - - The single-use lifetime means the PTY child cannot reconnect without a - new sidecar URL. PTY children open ``/api/pub`` once at startup; if - reconnect semantics ever become important, this should be upgraded to - a long-lived process-scoped token. + Gated mode: authenticates with the process-lifetime internal credential + (``?internal=``), the same one ``_build_gateway_ws_url`` uses. The PTY + child is a server-spawned process we trust; the credential is multi-use + and never expires, so the child can reconnect ``/api/pub`` without a new + URL. (This previously minted a single-use 30s ticket, which meant the + child could not reconnect and could miss the window on a slow cold boot.) + Connections authenticated this way are recorded under the + ``server-internal`` identity in the audit log. """ host = getattr(app.state, "bound_host", None) port = getattr(app.state, "bound_port", None) @@ -6782,11 +6825,13 @@ def _build_sidecar_url(channel: str) -> Optional[str]: netloc = f"[{host}]:{port}" if ":" in host and not host.startswith("[") else f"{host}:{port}" if getattr(app.state, "auth_required", False): - # Gated mode — mint a ticket so the WS upgrade survives _ws_auth_ok. - from hermes_cli.dashboard_auth.ws_tickets import mint_ticket + # Gated mode — use the internal credential so the WS upgrade survives + # _ws_auth_ok and the child can reconnect. + from hermes_cli.dashboard_auth.ws_tickets import internal_ws_credential - ticket = mint_ticket(user_id="pty-sidecar", provider="server-internal") - qs = urllib.parse.urlencode({"ticket": ticket, "channel": channel}) + qs = urllib.parse.urlencode( + {"internal": internal_ws_credential(), "channel": channel} + ) else: qs = urllib.parse.urlencode({"token": _SESSION_TOKEN, "channel": channel}) diff --git a/tests/hermes_cli/test_dashboard_auth_ws_auth.py b/tests/hermes_cli/test_dashboard_auth_ws_auth.py index df16dd000..d4f9dbbdd 100644 --- a/tests/hermes_cli/test_dashboard_auth_ws_auth.py +++ b/tests/hermes_cli/test_dashboard_auth_ws_auth.py @@ -29,7 +29,8 @@ from hermes_cli import web_server from hermes_cli.dashboard_auth import clear_providers, register_provider from hermes_cli.dashboard_auth.ws_tickets import ( _reset_for_tests, - consume_ticket, + consume_internal_credential, + internal_ws_credential, mint_ticket, ) from tests.hermes_cli.conftest_dashboard_auth import StubAuthProvider @@ -279,10 +280,33 @@ class TestWsAuthOkGated: content = log_file.read_text() assert "ws_ticket_rejected" in content + def test_internal_credential_accepted(self, gated_app): + """Server-spawned children present the process-lifetime internal + credential via ?internal= and are accepted in gated mode.""" + cred = internal_ws_credential() + ws = _fake_ws(query={"internal": cred}) + assert web_server._ws_auth_ok(ws) is True -# --------------------------------------------------------------------------- -# _build_sidecar_url — gated mode mints a server-internal ticket -# --------------------------------------------------------------------------- + def test_internal_credential_is_multi_use(self, gated_app): + """Unlike single-use tickets, the internal credential survives + repeated use so the child can reconnect.""" + cred = internal_ws_credential() + for _ in range(3): + ws = _fake_ws(query={"internal": cred}) + assert web_server._ws_auth_ok(ws) is True + + def test_wrong_internal_credential_rejected(self, gated_app): + # Mint the real one so the store is non-empty, then present a bogus value. + internal_ws_credential() + ws = _fake_ws(query={"internal": "not-the-internal-credential"}) + assert web_server._ws_auth_ok(ws) is False + + def test_internal_credential_not_accepted_in_loopback(self, loopback_app): + """Outside gated mode, ?internal= is meaningless — only ?token= works. + A naked internal credential must not authenticate.""" + cred = internal_ws_credential() + ws = _fake_ws(query={"internal": cred}) + assert web_server._ws_auth_ok(ws) is False class TestWsRequestIsAllowedGated: @@ -477,18 +501,20 @@ class TestSidecarUrl: assert f"token={web_server._SESSION_TOKEN}" in url assert "ticket=" not in url - def test_gated_uses_ticket(self, gated_app): + def test_gated_uses_internal_credential(self, gated_app): url = web_server._build_sidecar_url("ch-1") assert url is not None assert "token=" not in url - assert "ticket=" in url - # And the ticket should be live. - ticket = url.split("ticket=")[1].split("&")[0] - info = consume_ticket(ticket) - # Sidecar tickets are bound to the pseudo-user so audit logs can - # distinguish them from real browser tickets. - assert info["user_id"] == "pty-sidecar" + assert "ticket=" not in url + assert "internal=" in url + # The value should be the live process-lifetime internal credential, + # multi-use so the child can reconnect /api/pub. + cred = url.split("internal=")[1].split("&")[0] + info = consume_internal_credential(cred) + assert info["user_id"] == "server-internal" assert info["provider"] == "server-internal" + # Multi-use: a second consume still succeeds (unlike a ticket). + assert consume_internal_credential(cred)["provider"] == "server-internal" def test_no_bound_host_returns_none(self, gated_app): web_server.app.state.bound_host = None @@ -496,3 +522,48 @@ class TestSidecarUrl: assert web_server._build_sidecar_url("ch") is None finally: web_server.app.state.bound_host = "fly-app.fly.dev" + + +# --------------------------------------------------------------------------- +# _build_gateway_ws_url — the TUI child's primary JSON-RPC backend WS. +# Loopback uses ?token=; gated mode uses the multi-use internal credential +# (NOT a single-use ticket — the child reuses this URL across reconnects). +# --------------------------------------------------------------------------- + + +class TestGatewayWsUrl: + def test_loopback_uses_session_token(self, loopback_app): + url = web_server._build_gateway_ws_url() + assert url is not None + assert "/api/ws?" in url + assert f"token={web_server._SESSION_TOKEN}" in url + assert "internal=" not in url + + def test_gated_uses_internal_credential(self, gated_app): + url = web_server._build_gateway_ws_url() + assert url is not None + assert "/api/ws?" in url + assert "token=" not in url + assert "ticket=" not in url + assert "internal=" in url + cred = url.split("internal=")[1].split("&")[0] + # The credential authenticates against _ws_auth_ok in gated mode. + ws = _fake_ws(query={"internal": cred}) + assert web_server._ws_auth_ok(ws) is True + + def test_gated_credential_matches_sidecar(self, gated_app): + """Both server-internal builders share one process credential, so a + single value authenticates /api/ws and /api/pub alike.""" + gw = web_server._build_gateway_ws_url() + sc = web_server._build_sidecar_url("ch-1") + assert gw is not None and sc is not None + gw_cred = gw.split("internal=")[1].split("&")[0] + sc_cred = sc.split("internal=")[1].split("&")[0] + assert gw_cred == sc_cred + + def test_no_bound_host_returns_none(self, gated_app): + web_server.app.state.bound_host = None + try: + assert web_server._build_gateway_ws_url() is None + finally: + web_server.app.state.bound_host = "fly-app.fly.dev"