fix(dashboard-auth): don't abort verify chain on one provider's ProviderError

The gated dashboard verifies a session cookie by trying each registered
DashboardAuthProvider's verify_session in turn (the session cookie stores
only the access token, not which provider issued it). A provider that
doesn't recognise a token returns None; a provider whose IDP/JWKS is
unreachable raises ProviderError.

The loop used to return HTTP 503 on the FIRST ProviderError, before any
later provider got a turn. With multiple providers stacked, that means an
unreachable IDP for a session you didn't even use blocks login through a
different, reachable provider.

Concrete repro: a self-hosted-OIDC session hits the 'nous' provider first
(registered earlier); nous tries to reach Nous Portal's JWKS, which is
unreachable in a self-hosted deployment, so it raises — and the gate
503s before the 'self-hosted' provider can verify the token. Hit live
while testing the new self-hosted OIDC plugin against a local Keycloak.

Fix: a ProviderError from one provider is logged and the loop continues
to the next. A 503 is returned only if NO provider verified the token
AND at least one was unreachable — distinguishing a transient IDP outage
(don't force a needless re-login) from a token that's genuinely invalid
(fall through to refresh/relogin). Single-provider behaviour is
unchanged.

Tests: adds an _UnreachableProvider stub and three cases — unreachable
provider first must not block a working second; all-unreachable still
503s; reachable-but-unrecognised falls through to 401/relogin (not 503).
Mutation-tested: reverting the fix makes the first case fail with the
exact 503 bug.
This commit is contained in:
Ben
2026-06-04 17:02:43 +10:00
committed by Teknium
parent f57ce341dc
commit 616c0a36b6
2 changed files with 151 additions and 4 deletions

View File

@ -207,6 +207,22 @@ async def gated_auth_middleware(
# good refresh token — defeating the whole transparent-refresh feature.
session = None
if at:
# Try every registered provider's verify_session in turn. A provider
# that doesn't recognise the token returns None and we move on; the
# first provider that returns a Session wins.
#
# A provider may instead raise ProviderError (its IDP/JWKS is
# unreachable, so it can neither confirm nor deny the token). With
# multiple providers stacked, that MUST NOT abort the chain — the
# token may belong to a *different*, reachable provider. (Concretely:
# a self-hosted-OIDC session hits the `nous` provider first, which
# tries to reach Nous Portal's JWKS; if that's unreachable it raises,
# but the `self-hosted` provider can still verify the token.) So we
# remember the unreachable error and keep going. Only if NO provider
# verifies the token AND at least one was unreachable do we surface a
# 503 — distinguishing "transient IDP outage" (don't force re-login)
# from "token genuinely invalid" (fall through to refresh/relogin).
unreachable_provider: str | None = None
for provider in list_providers():
try:
session = provider.verify_session(access_token=at)
@ -221,12 +237,19 @@ async def gated_auth_middleware(
reason="provider_unreachable",
ip=_client_ip(request),
)
return JSONResponse(
{"detail": f"Auth provider {provider.name!r} unreachable"},
status_code=503,
)
if unreachable_provider is None:
unreachable_provider = provider.name
continue
if session is not None:
break
if session is None and unreachable_provider is not None:
# No provider could verify the token and at least one couldn't be
# reached — treat as a transient outage rather than forcing a
# re-login through a (possibly also-unreachable) refresh.
return JSONResponse(
{"detail": f"Auth provider {unreachable_provider!r} unreachable"},
status_code=503,
)
if session is None:
# Access token is expired/invalid. Before forcing re-login, try to

View File

@ -340,3 +340,127 @@ def test_gated_zero_providers_login_page_renders_help_text():
finally:
web_server.app.state.auth_required = prev_required
web_server.app.state.bound_host = prev_host
# ---------------------------------------------------------------------------
# Multi-provider verify: a ProviderError from one provider must not abort the
# chain when another provider can verify the token.
# ---------------------------------------------------------------------------
class _UnreachableProvider(StubAuthProvider):
"""A provider whose IDP is unreachable: verify_session always raises.
Models the real-world bug — a self-hosted-OIDC session hits the ``nous``
provider first, which tries to reach Nous Portal's JWKS; if that's
unreachable ``nous`` raises ProviderError. The gate must keep trying the
remaining providers rather than 503-ing the whole request.
"""
name = "unreachable"
display_name = "Unreachable IdP (test only)"
def verify_session(self, *, access_token: str):
from hermes_cli.dashboard_auth.base import ProviderError
raise ProviderError("simulated: IDP/JWKS unreachable")
def refresh_session(self, *, refresh_token: str):
from hermes_cli.dashboard_auth.base import ProviderError
raise ProviderError("simulated: IDP/JWKS unreachable")
def _mint_stub_at(stub: StubAuthProvider) -> str:
"""Mint a valid access-token cookie value from a StubAuthProvider via its
own login round trip (so the HMAC signature matches what verify expects)."""
ls = stub.start_login(redirect_uri="https://fly-app.fly.dev/auth/callback")
state = dict(
seg.split("=", 1)
for seg in ls.cookie_payload["hermes_session_pkce"].split(";")
if "=" in seg
)["state"]
verifier = dict(
seg.split("=", 1)
for seg in ls.cookie_payload["hermes_session_pkce"].split(";")
if "=" in seg
)["verifier"]
session = stub.complete_login(
code="stub_code",
state=state,
code_verifier=verifier,
redirect_uri="https://fly-app.fly.dev/auth/callback",
)
return session.access_token
@pytest.fixture
def _gated_state():
"""Bare gated app-state setup WITHOUT registering any provider, so each
test controls provider registration order itself. Yields a factory that
builds the TestClient after providers are registered."""
clear_providers()
prev_host = getattr(web_server.app.state, "bound_host", None)
prev_port = getattr(web_server.app.state, "bound_port", None)
prev_required = getattr(web_server.app.state, "auth_required", None)
web_server.app.state.bound_host = "fly-app.fly.dev"
web_server.app.state.bound_port = 443
web_server.app.state.auth_required = True
def _client() -> TestClient:
return TestClient(web_server.app, base_url="https://fly-app.fly.dev")
yield _client
clear_providers()
web_server.app.state.bound_host = prev_host
web_server.app.state.bound_port = prev_port
web_server.app.state.auth_required = prev_required
def test_unreachable_first_provider_does_not_block_second(_gated_state):
"""An unreachable provider registered FIRST must not 503 a request whose
token a later provider can verify.
Regression for the stacked-provider bug: the verify loop used to return
503 on the first provider's ProviderError, before the working provider
ever got a turn. Now it logs, continues, and the working provider wins.
"""
working = StubAuthProvider()
register_provider(_UnreachableProvider()) # registered first → tried first
register_provider(working) # the one that can verify
at = _mint_stub_at(working)
client = _gated_state()
client.cookies.set(SESSION_AT_COOKIE, at)
r = client.get("/api/auth/me")
assert r.status_code == 200, (
f"Expected the working provider to verify the session despite the "
f"unreachable one being tried first; got {r.status_code}: {r.text}"
)
body = r.json()
assert body["provider"] == "stub"
assert body["user_id"] == "stub-user-1"
def test_all_providers_unreachable_returns_503(_gated_state):
"""If NO provider can verify the token AND at least one was unreachable,
surface 503 (transient outage) rather than forcing a needless re-login."""
register_provider(_UnreachableProvider())
client = _gated_state()
# Any non-empty cookie — the unreachable provider raises before parsing.
client.cookies.set(SESSION_AT_COOKIE, "some-opaque-token")
r = client.get("/api/auth/me")
assert r.status_code == 503
assert "unreachable" in r.text.lower()
def test_unverifiable_token_with_reachable_providers_redirects(_gated_state):
"""When every provider is REACHABLE but none recognises the token (all
return None, none raises), the gate falls through to re-login — NOT 503."""
register_provider(StubAuthProvider())
client = _gated_state()
client.cookies.set(SESSION_AT_COOKIE, "garbage-not-a-real-token")
# API path → 401; HTML would 302. Either way, NOT 503.
r = client.get("/api/auth/me")
assert r.status_code == 401
assert "unreachable" not in r.text.lower()