fix(dashboard-auth): don't abort verify chain on one provider's ProviderError
The gated dashboard verifies a session cookie by trying each registered DashboardAuthProvider's verify_session in turn (the session cookie stores only the access token, not which provider issued it). A provider that doesn't recognise a token returns None; a provider whose IDP/JWKS is unreachable raises ProviderError. The loop used to return HTTP 503 on the FIRST ProviderError, before any later provider got a turn. With multiple providers stacked, that means an unreachable IDP for a session you didn't even use blocks login through a different, reachable provider. Concrete repro: a self-hosted-OIDC session hits the 'nous' provider first (registered earlier); nous tries to reach Nous Portal's JWKS, which is unreachable in a self-hosted deployment, so it raises — and the gate 503s before the 'self-hosted' provider can verify the token. Hit live while testing the new self-hosted OIDC plugin against a local Keycloak. Fix: a ProviderError from one provider is logged and the loop continues to the next. A 503 is returned only if NO provider verified the token AND at least one was unreachable — distinguishing a transient IDP outage (don't force a needless re-login) from a token that's genuinely invalid (fall through to refresh/relogin). Single-provider behaviour is unchanged. Tests: adds an _UnreachableProvider stub and three cases — unreachable provider first must not block a working second; all-unreachable still 503s; reachable-but-unrecognised falls through to 401/relogin (not 503). Mutation-tested: reverting the fix makes the first case fail with the exact 503 bug.
This commit is contained in:
@ -207,6 +207,22 @@ async def gated_auth_middleware(
|
||||
# good refresh token — defeating the whole transparent-refresh feature.
|
||||
session = None
|
||||
if at:
|
||||
# Try every registered provider's verify_session in turn. A provider
|
||||
# that doesn't recognise the token returns None and we move on; the
|
||||
# first provider that returns a Session wins.
|
||||
#
|
||||
# A provider may instead raise ProviderError (its IDP/JWKS is
|
||||
# unreachable, so it can neither confirm nor deny the token). With
|
||||
# multiple providers stacked, that MUST NOT abort the chain — the
|
||||
# token may belong to a *different*, reachable provider. (Concretely:
|
||||
# a self-hosted-OIDC session hits the `nous` provider first, which
|
||||
# tries to reach Nous Portal's JWKS; if that's unreachable it raises,
|
||||
# but the `self-hosted` provider can still verify the token.) So we
|
||||
# remember the unreachable error and keep going. Only if NO provider
|
||||
# verifies the token AND at least one was unreachable do we surface a
|
||||
# 503 — distinguishing "transient IDP outage" (don't force re-login)
|
||||
# from "token genuinely invalid" (fall through to refresh/relogin).
|
||||
unreachable_provider: str | None = None
|
||||
for provider in list_providers():
|
||||
try:
|
||||
session = provider.verify_session(access_token=at)
|
||||
@ -221,12 +237,19 @@ async def gated_auth_middleware(
|
||||
reason="provider_unreachable",
|
||||
ip=_client_ip(request),
|
||||
)
|
||||
return JSONResponse(
|
||||
{"detail": f"Auth provider {provider.name!r} unreachable"},
|
||||
status_code=503,
|
||||
)
|
||||
if unreachable_provider is None:
|
||||
unreachable_provider = provider.name
|
||||
continue
|
||||
if session is not None:
|
||||
break
|
||||
if session is None and unreachable_provider is not None:
|
||||
# No provider could verify the token and at least one couldn't be
|
||||
# reached — treat as a transient outage rather than forcing a
|
||||
# re-login through a (possibly also-unreachable) refresh.
|
||||
return JSONResponse(
|
||||
{"detail": f"Auth provider {unreachable_provider!r} unreachable"},
|
||||
status_code=503,
|
||||
)
|
||||
|
||||
if session is None:
|
||||
# Access token is expired/invalid. Before forcing re-login, try to
|
||||
|
||||
@ -340,3 +340,127 @@ def test_gated_zero_providers_login_page_renders_help_text():
|
||||
finally:
|
||||
web_server.app.state.auth_required = prev_required
|
||||
web_server.app.state.bound_host = prev_host
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Multi-provider verify: a ProviderError from one provider must not abort the
|
||||
# chain when another provider can verify the token.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class _UnreachableProvider(StubAuthProvider):
|
||||
"""A provider whose IDP is unreachable: verify_session always raises.
|
||||
|
||||
Models the real-world bug — a self-hosted-OIDC session hits the ``nous``
|
||||
provider first, which tries to reach Nous Portal's JWKS; if that's
|
||||
unreachable ``nous`` raises ProviderError. The gate must keep trying the
|
||||
remaining providers rather than 503-ing the whole request.
|
||||
"""
|
||||
|
||||
name = "unreachable"
|
||||
display_name = "Unreachable IdP (test only)"
|
||||
|
||||
def verify_session(self, *, access_token: str):
|
||||
from hermes_cli.dashboard_auth.base import ProviderError
|
||||
|
||||
raise ProviderError("simulated: IDP/JWKS unreachable")
|
||||
|
||||
def refresh_session(self, *, refresh_token: str):
|
||||
from hermes_cli.dashboard_auth.base import ProviderError
|
||||
|
||||
raise ProviderError("simulated: IDP/JWKS unreachable")
|
||||
|
||||
|
||||
def _mint_stub_at(stub: StubAuthProvider) -> str:
|
||||
"""Mint a valid access-token cookie value from a StubAuthProvider via its
|
||||
own login round trip (so the HMAC signature matches what verify expects)."""
|
||||
ls = stub.start_login(redirect_uri="https://fly-app.fly.dev/auth/callback")
|
||||
state = dict(
|
||||
seg.split("=", 1)
|
||||
for seg in ls.cookie_payload["hermes_session_pkce"].split(";")
|
||||
if "=" in seg
|
||||
)["state"]
|
||||
verifier = dict(
|
||||
seg.split("=", 1)
|
||||
for seg in ls.cookie_payload["hermes_session_pkce"].split(";")
|
||||
if "=" in seg
|
||||
)["verifier"]
|
||||
session = stub.complete_login(
|
||||
code="stub_code",
|
||||
state=state,
|
||||
code_verifier=verifier,
|
||||
redirect_uri="https://fly-app.fly.dev/auth/callback",
|
||||
)
|
||||
return session.access_token
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def _gated_state():
|
||||
"""Bare gated app-state setup WITHOUT registering any provider, so each
|
||||
test controls provider registration order itself. Yields a factory that
|
||||
builds the TestClient after providers are registered."""
|
||||
clear_providers()
|
||||
prev_host = getattr(web_server.app.state, "bound_host", None)
|
||||
prev_port = getattr(web_server.app.state, "bound_port", None)
|
||||
prev_required = getattr(web_server.app.state, "auth_required", None)
|
||||
web_server.app.state.bound_host = "fly-app.fly.dev"
|
||||
web_server.app.state.bound_port = 443
|
||||
web_server.app.state.auth_required = True
|
||||
|
||||
def _client() -> TestClient:
|
||||
return TestClient(web_server.app, base_url="https://fly-app.fly.dev")
|
||||
|
||||
yield _client
|
||||
clear_providers()
|
||||
web_server.app.state.bound_host = prev_host
|
||||
web_server.app.state.bound_port = prev_port
|
||||
web_server.app.state.auth_required = prev_required
|
||||
|
||||
|
||||
def test_unreachable_first_provider_does_not_block_second(_gated_state):
|
||||
"""An unreachable provider registered FIRST must not 503 a request whose
|
||||
token a later provider can verify.
|
||||
|
||||
Regression for the stacked-provider bug: the verify loop used to return
|
||||
503 on the first provider's ProviderError, before the working provider
|
||||
ever got a turn. Now it logs, continues, and the working provider wins.
|
||||
"""
|
||||
working = StubAuthProvider()
|
||||
register_provider(_UnreachableProvider()) # registered first → tried first
|
||||
register_provider(working) # the one that can verify
|
||||
|
||||
at = _mint_stub_at(working)
|
||||
client = _gated_state()
|
||||
client.cookies.set(SESSION_AT_COOKIE, at)
|
||||
r = client.get("/api/auth/me")
|
||||
assert r.status_code == 200, (
|
||||
f"Expected the working provider to verify the session despite the "
|
||||
f"unreachable one being tried first; got {r.status_code}: {r.text}"
|
||||
)
|
||||
body = r.json()
|
||||
assert body["provider"] == "stub"
|
||||
assert body["user_id"] == "stub-user-1"
|
||||
|
||||
|
||||
def test_all_providers_unreachable_returns_503(_gated_state):
|
||||
"""If NO provider can verify the token AND at least one was unreachable,
|
||||
surface 503 (transient outage) rather than forcing a needless re-login."""
|
||||
register_provider(_UnreachableProvider())
|
||||
client = _gated_state()
|
||||
# Any non-empty cookie — the unreachable provider raises before parsing.
|
||||
client.cookies.set(SESSION_AT_COOKIE, "some-opaque-token")
|
||||
r = client.get("/api/auth/me")
|
||||
assert r.status_code == 503
|
||||
assert "unreachable" in r.text.lower()
|
||||
|
||||
|
||||
def test_unverifiable_token_with_reachable_providers_redirects(_gated_state):
|
||||
"""When every provider is REACHABLE but none recognises the token (all
|
||||
return None, none raises), the gate falls through to re-login — NOT 503."""
|
||||
register_provider(StubAuthProvider())
|
||||
client = _gated_state()
|
||||
client.cookies.set(SESSION_AT_COOKIE, "garbage-not-a-real-token")
|
||||
# API path → 401; HTML would 302. Either way, NOT 503.
|
||||
r = client.get("/api/auth/me")
|
||||
assert r.status_code == 401
|
||||
assert "unreachable" not in r.text.lower()
|
||||
|
||||
Reference in New Issue
Block a user