diff --git a/hermes_cli/dashboard_auth/middleware.py b/hermes_cli/dashboard_auth/middleware.py index 0e25c1033..f3ad42a61 100644 --- a/hermes_cli/dashboard_auth/middleware.py +++ b/hermes_cli/dashboard_auth/middleware.py @@ -207,6 +207,22 @@ async def gated_auth_middleware( # good refresh token — defeating the whole transparent-refresh feature. session = None if at: + # Try every registered provider's verify_session in turn. A provider + # that doesn't recognise the token returns None and we move on; the + # first provider that returns a Session wins. + # + # A provider may instead raise ProviderError (its IDP/JWKS is + # unreachable, so it can neither confirm nor deny the token). With + # multiple providers stacked, that MUST NOT abort the chain — the + # token may belong to a *different*, reachable provider. (Concretely: + # a self-hosted-OIDC session hits the `nous` provider first, which + # tries to reach Nous Portal's JWKS; if that's unreachable it raises, + # but the `self-hosted` provider can still verify the token.) So we + # remember the unreachable error and keep going. Only if NO provider + # verifies the token AND at least one was unreachable do we surface a + # 503 — distinguishing "transient IDP outage" (don't force re-login) + # from "token genuinely invalid" (fall through to refresh/relogin). + unreachable_provider: str | None = None for provider in list_providers(): try: session = provider.verify_session(access_token=at) @@ -221,12 +237,19 @@ async def gated_auth_middleware( reason="provider_unreachable", ip=_client_ip(request), ) - return JSONResponse( - {"detail": f"Auth provider {provider.name!r} unreachable"}, - status_code=503, - ) + if unreachable_provider is None: + unreachable_provider = provider.name + continue if session is not None: break + if session is None and unreachable_provider is not None: + # No provider could verify the token and at least one couldn't be + # reached — treat as a transient outage rather than forcing a + # re-login through a (possibly also-unreachable) refresh. + return JSONResponse( + {"detail": f"Auth provider {unreachable_provider!r} unreachable"}, + status_code=503, + ) if session is None: # Access token is expired/invalid. Before forcing re-login, try to diff --git a/tests/hermes_cli/test_dashboard_auth_middleware.py b/tests/hermes_cli/test_dashboard_auth_middleware.py index cbbcc6d28..a7363a1db 100644 --- a/tests/hermes_cli/test_dashboard_auth_middleware.py +++ b/tests/hermes_cli/test_dashboard_auth_middleware.py @@ -340,3 +340,127 @@ def test_gated_zero_providers_login_page_renders_help_text(): finally: web_server.app.state.auth_required = prev_required web_server.app.state.bound_host = prev_host + + +# --------------------------------------------------------------------------- +# Multi-provider verify: a ProviderError from one provider must not abort the +# chain when another provider can verify the token. +# --------------------------------------------------------------------------- + + +class _UnreachableProvider(StubAuthProvider): + """A provider whose IDP is unreachable: verify_session always raises. + + Models the real-world bug — a self-hosted-OIDC session hits the ``nous`` + provider first, which tries to reach Nous Portal's JWKS; if that's + unreachable ``nous`` raises ProviderError. The gate must keep trying the + remaining providers rather than 503-ing the whole request. + """ + + name = "unreachable" + display_name = "Unreachable IdP (test only)" + + def verify_session(self, *, access_token: str): + from hermes_cli.dashboard_auth.base import ProviderError + + raise ProviderError("simulated: IDP/JWKS unreachable") + + def refresh_session(self, *, refresh_token: str): + from hermes_cli.dashboard_auth.base import ProviderError + + raise ProviderError("simulated: IDP/JWKS unreachable") + + +def _mint_stub_at(stub: StubAuthProvider) -> str: + """Mint a valid access-token cookie value from a StubAuthProvider via its + own login round trip (so the HMAC signature matches what verify expects).""" + ls = stub.start_login(redirect_uri="https://fly-app.fly.dev/auth/callback") + state = dict( + seg.split("=", 1) + for seg in ls.cookie_payload["hermes_session_pkce"].split(";") + if "=" in seg + )["state"] + verifier = dict( + seg.split("=", 1) + for seg in ls.cookie_payload["hermes_session_pkce"].split(";") + if "=" in seg + )["verifier"] + session = stub.complete_login( + code="stub_code", + state=state, + code_verifier=verifier, + redirect_uri="https://fly-app.fly.dev/auth/callback", + ) + return session.access_token + + +@pytest.fixture +def _gated_state(): + """Bare gated app-state setup WITHOUT registering any provider, so each + test controls provider registration order itself. Yields a factory that + builds the TestClient after providers are registered.""" + clear_providers() + prev_host = getattr(web_server.app.state, "bound_host", None) + prev_port = getattr(web_server.app.state, "bound_port", None) + prev_required = getattr(web_server.app.state, "auth_required", None) + web_server.app.state.bound_host = "fly-app.fly.dev" + web_server.app.state.bound_port = 443 + web_server.app.state.auth_required = True + + def _client() -> TestClient: + return TestClient(web_server.app, base_url="https://fly-app.fly.dev") + + yield _client + clear_providers() + web_server.app.state.bound_host = prev_host + web_server.app.state.bound_port = prev_port + web_server.app.state.auth_required = prev_required + + +def test_unreachable_first_provider_does_not_block_second(_gated_state): + """An unreachable provider registered FIRST must not 503 a request whose + token a later provider can verify. + + Regression for the stacked-provider bug: the verify loop used to return + 503 on the first provider's ProviderError, before the working provider + ever got a turn. Now it logs, continues, and the working provider wins. + """ + working = StubAuthProvider() + register_provider(_UnreachableProvider()) # registered first → tried first + register_provider(working) # the one that can verify + + at = _mint_stub_at(working) + client = _gated_state() + client.cookies.set(SESSION_AT_COOKIE, at) + r = client.get("/api/auth/me") + assert r.status_code == 200, ( + f"Expected the working provider to verify the session despite the " + f"unreachable one being tried first; got {r.status_code}: {r.text}" + ) + body = r.json() + assert body["provider"] == "stub" + assert body["user_id"] == "stub-user-1" + + +def test_all_providers_unreachable_returns_503(_gated_state): + """If NO provider can verify the token AND at least one was unreachable, + surface 503 (transient outage) rather than forcing a needless re-login.""" + register_provider(_UnreachableProvider()) + client = _gated_state() + # Any non-empty cookie — the unreachable provider raises before parsing. + client.cookies.set(SESSION_AT_COOKIE, "some-opaque-token") + r = client.get("/api/auth/me") + assert r.status_code == 503 + assert "unreachable" in r.text.lower() + + +def test_unverifiable_token_with_reachable_providers_redirects(_gated_state): + """When every provider is REACHABLE but none recognises the token (all + return None, none raises), the gate falls through to re-login — NOT 503.""" + register_provider(StubAuthProvider()) + client = _gated_state() + client.cookies.set(SESSION_AT_COOKIE, "garbage-not-a-real-token") + # API path → 401; HTML would 302. Either way, NOT 503. + r = client.get("/api/auth/me") + assert r.status_code == 401 + assert "unreachable" not in r.text.lower()