From 86a389fee29796a079599d479229a68f5e845671 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 28 May 2026 23:45:42 -0700 Subject: [PATCH] fix(credential-pool): STATUS_DEAD for terminal OAuth failures (#32849) (#34412) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When OpenAI Codex returns 401 token_invalidated or token_revoked, the credential is broken upstream — retrying after a TTL cooldown cannot fix it. The existing code treated every 401/429 the same way: STATUS_EXHAUSTED with a TTL cooldown (5 min for 401, 1 hour for 429). After the TTL elapsed, the broken credential re-entered rotation and immediately failed again with the same 401, surfacing as 'Failed to generate context summary' on every context-compression cycle. Reporter observed 7 separate 401 token_invalidated failures from the same revoked credential in a single day; the only workaround was removing it manually via 'hermes auth'. Add a STATUS_DEAD terminal state. Only 401 responses whose error.code/reason matches a known terminal OAuth state (token_invalidated, token_revoked, invalid_token, invalid_grant, unauthorized_client, refresh_token_reused) transition to DEAD. Everything else keeps the existing TTL semantics — 429 rate limits are transient and should recover. DEAD entries are excluded from rotation unconditionally. They only clear when an explicit write-side re-auth sync rewrites the tokens (the existing _sync_codex_pool_entries / _sync_*_entry_from_auth_store paths already clear last_status to None). The read-side auth.json-sync paths also now fire on DEAD so an in-flight pool entry can adopt fresh tokens written by another process without needing explicit re-auth. After 24 hours, DEAD manual entries (source='manual:*') are pruned from the pool automatically so dead state doesn't accumulate forever. Singleton-seeded DEAD entries (source='device_code' etc.) are kept because _seed_from_singletons would recreate them on the next load with the same stale tokens — pruning would be pointless. The audit trail stays visible (label, last_error_reason, timestamps). Closes #32849. --- agent/credential_pool.py | 129 ++++++++- tests/agent/test_credential_pool.py | 409 ++++++++++++++++++++++++++++ 2 files changed, 529 insertions(+), 9 deletions(-) diff --git a/agent/credential_pool.py b/agent/credential_pool.py index feb3cc06b..072199ce7 100644 --- a/agent/credential_pool.py +++ b/agent/credential_pool.py @@ -55,6 +55,38 @@ def _load_config_safe() -> Optional[dict]: STATUS_OK = "ok" STATUS_EXHAUSTED = "exhausted" +# Terminal failure — the credential will never recover on its own. Used for +# upstream-permanent OAuth states like ``token_invalidated`` / ``token_revoked`` +# where retrying after a TTL cooldown is guaranteed to fail. ``DEAD`` entries +# are excluded from rotation unconditionally and only clear when an explicit +# write-side sync (e.g. ``_save_codex_tokens`` after a fresh device-code +# login) rewrites the tokens. +STATUS_DEAD = "dead" + +# OAuth error reasons that indicate the credential is permanently invalid +# server-side and cannot be recovered by retry/refresh. Sourced from +# OpenAI Codex Responses API, Anthropic, xAI, and Google OAuth spec. +_TERMINAL_AUTH_REASONS = frozenset({ + "token_invalidated", # OpenAI Codex: "Your authentication token has been invalidated." + "token_revoked", # OAuth 2.0 RFC 7009: token explicitly revoked + "invalid_token", # RFC 6750: bearer token is malformed/expired/revoked + "invalid_grant", # RFC 6749: refresh_token rejected during refresh + "unauthorized_client", # RFC 6749: client no longer authorized + "refresh_token_reused", # Single-use refresh token consumed by another process +}) + +# How long a DEAD manual credential is preserved before being pruned. +# Manual entries (``manual:*``) are independent credentials with no singleton +# to re-seed from, so pruning them after a quiet window cleans up dead state +# without losing recoverability — the user always has the option to re-add +# via ``hermes auth add``. +# +# Singleton-seeded entries (``device_code``, ``loopback_pkce``, ``claude_code``) +# are NOT pruned because ``_seed_from_singletons`` would just re-create them +# on the next ``load_pool()`` with the same stale singleton tokens, defeating +# the cleanup. They remain in the pool marked DEAD until an explicit re-auth +# write-side sync (``_save_codex_tokens`` etc.) clears the status. +DEAD_MANUAL_PRUNE_TTL_SECONDS = 24 * 60 * 60 # 24 hours AUTH_TYPE_OAUTH = "oauth" AUTH_TYPE_API_KEY = "api_key" @@ -438,6 +470,29 @@ class CredentialPool: [entry.to_dict() for entry in self._entries], ) + def _is_terminal_auth_failure( + self, + status_code: Optional[int], + normalized_error: Dict[str, Any], + ) -> bool: + """Detect upstream-permanent OAuth failures that won't recover on TTL. + + Only fires for 401 responses whose error code/reason matches a known + terminal OAuth state (token_invalidated, token_revoked, invalid_grant, + etc.). Distinguishes permanent failures from transient ones like + token_expired (refreshable) or generic 401 without a specific reason + (could be a server-side glitch worth retrying). + + Returns False for non-401 status codes — 429 rate limits and 402 + billing failures are transient by nature and should keep TTL semantics. + """ + if status_code != 401: + return False + reason = normalized_error.get("reason") + if not isinstance(reason, str): + return False + return reason.strip().lower() in _TERMINAL_AUTH_REASONS + def _mark_exhausted( self, entry: PooledCredential, @@ -445,9 +500,20 @@ class CredentialPool: error_context: Optional[Dict[str, Any]] = None, ) -> PooledCredential: normalized_error = _normalize_error_context(error_context) + # Permanent OAuth failures (token_invalidated, token_revoked, etc.) + # transition to STATUS_DEAD instead of STATUS_EXHAUSTED. Without this, + # a revoked credential gets a 1-hour TTL cooldown and then re-enters + # rotation, failing immediately every hour until the user manually + # removes it (issue #32849). DEAD entries are excluded from rotation + # unconditionally and only clear via an explicit re-auth write-side + # sync (``_save_codex_tokens`` after a fresh device-code login). + if self._is_terminal_auth_failure(status_code, normalized_error): + terminal_status = STATUS_DEAD + else: + terminal_status = STATUS_EXHAUSTED updated = replace( entry, - last_status=STATUS_EXHAUSTED, + last_status=terminal_status, last_status_at=time.time(), last_error_code=status_code, last_error_reason=normalized_error.get("reason"), @@ -1158,13 +1224,14 @@ class CredentialPool: """ now = time.time() cleared_any = False + entries_to_prune: List[str] = [] available: List[PooledCredential] = [] for entry in self._entries: # For anthropic claude_code entries, sync from the credentials file # before any status/refresh checks. This picks up tokens refreshed # by other processes (Claude Code CLI, other Hermes profiles). if (self.provider == "anthropic" and entry.source == "claude_code" - and entry.last_status == STATUS_EXHAUSTED): + and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}): synced = self._sync_anthropic_entry_from_credentials_file(entry) if synced is not entry: entry = synced @@ -1175,7 +1242,7 @@ class CredentialPool: # exhausted status stale. if (self.provider == "nous" and entry.source == "device_code" - and entry.last_status == STATUS_EXHAUSTED): + and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}): synced = self._sync_nous_entry_from_auth_store(entry) if synced is not entry: entry = synced @@ -1187,7 +1254,7 @@ class CredentialPool: # future for ChatGPT weekly windows). if (self.provider == "openai-codex" and entry.source == "device_code" - and entry.last_status == STATUS_EXHAUSTED): + and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}): synced = self._sync_codex_entry_from_auth_store(entry) if synced is not entry: entry = synced @@ -1198,11 +1265,41 @@ class CredentialPool: # xAI Grok OAuth login) has since rotated in auth.json. if (self.provider == "xai-oauth" and entry.source == "loopback_pkce" - and entry.last_status == STATUS_EXHAUSTED): + and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}): synced = self._sync_xai_oauth_entry_from_auth_store(entry) if synced is not entry: entry = synced cleared_any = True + if entry.last_status == STATUS_DEAD: + # Manual DEAD credentials get pruned after a 24h quiet window + # so the pool doesn't accumulate dead entries forever. The + # user can always re-add via ``hermes auth add``. Singleton- + # seeded DEAD entries are kept so the audit trail (label, + # last_error_reason, timestamps) stays visible — pruning them + # would just be undone by ``_seed_from_singletons`` on the + # next load anyway. + if _is_manual_source(entry.source): + dead_at = entry.last_status_at or 0 + if dead_at and now - dead_at > DEAD_MANUAL_PRUNE_TTL_SECONDS: + _label = entry.label or entry.id[:8] + logger.warning( + "credential pool: pruning DEAD manual entry %s " + "(reason=%s, age=%.1fh) — re-add via `hermes auth add %s`", + _label, + entry.last_error_reason or "unknown", + (now - dead_at) / 3600.0, + self.provider, + ) + # Mark for removal after the loop completes; we can't + # mutate self._entries while iterating. + entries_to_prune.append(entry.id) + cleared_any = True + # Permanently failed credentials never re-enter rotation via + # TTL. They only clear when a write-side re-auth sync rewrites + # the tokens (e.g. ``_save_codex_tokens`` after a fresh + # device-code login). The auth.json-sync paths below handle + # the re-auth case for OAuth singletons. + continue if entry.last_status == STATUS_EXHAUSTED: exhausted_until = _exhausted_until(entry) if exhausted_until is not None and now < exhausted_until: @@ -1226,6 +1323,9 @@ class CredentialPool: continue entry = refreshed available.append(entry) + if entries_to_prune: + pruned_ids = set(entries_to_prune) + self._entries = [e for e in self._entries if e.id not in pruned_ids] if cleared_any: self._persist() return available @@ -1293,11 +1393,22 @@ class CredentialPool: if entry is None: return None _label = entry.label or entry.id[:8] - logger.info( - "credential pool: marking %s exhausted (status=%s), rotating", - _label, status_code, - ) self._mark_exhausted(entry, status_code, error_context) + # Re-read the updated entry to log the correct terminal state. + updated_entry = next( + (e for e in self._entries if e.id == entry.id), entry, + ) + if updated_entry.last_status == STATUS_DEAD: + logger.warning( + "credential pool: marking %s DEAD (status=%s, reason=%s) — " + "permanently failed, will NOT re-enter rotation until re-auth", + _label, status_code, updated_entry.last_error_reason or "unknown", + ) + else: + logger.info( + "credential pool: marking %s exhausted (status=%s), rotating", + _label, status_code, + ) self._current_id = None next_entry = self._select_unlocked() if next_entry: diff --git a/tests/agent/test_credential_pool.py b/tests/agent/test_credential_pool.py index 69b30730e..b783c7ab6 100644 --- a/tests/agent/test_credential_pool.py +++ b/tests/agent/test_credential_pool.py @@ -379,6 +379,415 @@ def test_mark_exhausted_and_rotate_persists_status(tmp_path, monkeypatch): assert persisted["last_error_code"] == 402 +def test_token_invalidated_marks_credential_dead(tmp_path, monkeypatch): + """OpenAI Codex token_invalidated must mark the credential DEAD, not exhausted. + + Regression for #32849: when an OAuth credential is revoked upstream, the + 1-hour exhausted TTL means it re-enters rotation every hour and fails + again with the same 401 — surfacing as "Failed to generate context + summary" on context compression. Terminal OAuth failures should never + auto-recover. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes")) + _write_auth_store( + tmp_path, + { + "version": 1, + "credential_pool": { + "openai-codex": [ + { + "id": "cred-dead", + "label": "revoked", + "auth_type": "oauth", + "priority": 0, + "source": "manual:device_code", + "access_token": "revoked-at", + "refresh_token": "revoked-rt", + }, + { + "id": "cred-ok", + "label": "healthy", + "auth_type": "oauth", + "priority": 1, + "source": "manual:device_code", + "access_token": "healthy-at", + "refresh_token": "healthy-rt", + }, + ] + }, + }, + ) + + from agent.credential_pool import load_pool, STATUS_DEAD + + pool = load_pool("openai-codex") + assert pool.select().id == "cred-dead" + + # Simulate the exact OpenAI Codex 401 token_invalidated response shape. + next_entry = pool.mark_exhausted_and_rotate( + status_code=401, + error_context={ + "reason": "token_invalidated", + "message": "Your authentication token has been invalidated. Please try signing in again.", + }, + ) + + # Rotation still works — we hand off to the healthy credential. + assert next_entry is not None + assert next_entry.id == "cred-ok" + + # The revoked credential is now permanently marked DEAD. + auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text()) + persisted = auth_payload["credential_pool"]["openai-codex"][0] + assert persisted["last_status"] == STATUS_DEAD + assert persisted["last_error_code"] == 401 + assert persisted["last_error_reason"] == "token_invalidated" + + +def test_dead_credential_never_re_enters_rotation_after_ttl(tmp_path, monkeypatch): + """A DEAD credential must stay excluded regardless of how much time passes. + + The exhausted TTL clears entries after 5 min (401) / 1 hour (429). + A DEAD credential has no recovery TTL — it stays dead until either + (a) an explicit re-auth write-side sync rewrites the tokens, or + (b) the manual-prune TTL elapses (covered by separate tests below). + This test verifies the core invariant in the recent-entry window. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes")) + # DEAD entry from 2 hours ago — well past the exhausted TTLs (5min/1h) + # but well within the 24h manual-prune window. + two_hours_ago = time.time() - (2 * 3600) + _write_auth_store( + tmp_path, + { + "version": 1, + "credential_pool": { + "openai-codex": [ + { + "id": "cred-dead", + "label": "revoked", + "auth_type": "oauth", + "priority": 0, + "source": "manual:device_code", + "access_token": "revoked-at", + "refresh_token": "revoked-rt", + "last_status": "dead", + "last_status_at": two_hours_ago, + "last_error_code": 401, + "last_error_reason": "token_invalidated", + }, + { + "id": "cred-ok", + "label": "healthy", + "auth_type": "oauth", + "priority": 1, + "source": "manual:device_code", + "access_token": "healthy-at", + "refresh_token": "healthy-rt", + }, + ] + }, + }, + ) + + from agent.credential_pool import load_pool, STATUS_DEAD + + pool = load_pool("openai-codex") + selected = pool.select() + # Should skip the dead entry and pick the healthy one — even though + # the dead entry has priority 0 (would normally be picked first) and + # plenty of time has passed since it was marked dead. + assert selected is not None + assert selected.id == "cred-ok" + + # The DEAD entry is still marked dead on disk — not cleared by TTL. + auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text()) + dead_entry = next(e for e in auth_payload["credential_pool"]["openai-codex"] + if e["id"] == "cred-dead") + assert dead_entry["last_status"] == STATUS_DEAD + + +def test_429_rate_limit_still_uses_exhausted_not_dead(tmp_path, monkeypatch): + """429 rate limits must NOT be treated as terminal. + + They should keep the existing 1-hour TTL cooldown semantics so the + credential re-enters rotation once the rate window resets. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes")) + _write_auth_store( + tmp_path, + { + "version": 1, + "credential_pool": { + "openai-codex": [ + { + "id": "cred-1", + "label": "primary", + "auth_type": "oauth", + "priority": 0, + "source": "manual:device_code", + "access_token": "at-1", + "refresh_token": "rt-1", + }, + { + "id": "cred-2", + "label": "secondary", + "auth_type": "oauth", + "priority": 1, + "source": "manual:device_code", + "access_token": "at-2", + "refresh_token": "rt-2", + }, + ] + }, + }, + ) + + from agent.credential_pool import load_pool, STATUS_EXHAUSTED + + pool = load_pool("openai-codex") + assert pool.select().id == "cred-1" + + next_entry = pool.mark_exhausted_and_rotate( + status_code=429, + error_context={"reason": "rate_limit_exceeded", "message": "Rate limit exceeded"}, + ) + assert next_entry is not None + assert next_entry.id == "cred-2" + + auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text()) + persisted = auth_payload["credential_pool"]["openai-codex"][0] + # 429 stays exhausted (transient) — NOT dead. + assert persisted["last_status"] == STATUS_EXHAUSTED + assert persisted["last_error_code"] == 429 + + +def test_generic_401_without_terminal_reason_still_uses_exhausted(tmp_path, monkeypatch): + """A 401 with no specific code/reason should keep TTL semantics. + + Only specific terminal reasons (token_invalidated, token_revoked, etc.) + transition to DEAD. A generic 401 might be a transient server-side + issue worth retrying after the 5-min TTL. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes")) + _write_auth_store( + tmp_path, + { + "version": 1, + "credential_pool": { + "openai-codex": [ + { + "id": "cred-1", + "label": "primary", + "auth_type": "oauth", + "priority": 0, + "source": "manual:device_code", + "access_token": "at-1", + "refresh_token": "rt-1", + }, + { + "id": "cred-2", + "label": "secondary", + "auth_type": "oauth", + "priority": 1, + "source": "manual:device_code", + "access_token": "at-2", + "refresh_token": "rt-2", + }, + ] + }, + }, + ) + + from agent.credential_pool import load_pool, STATUS_EXHAUSTED + + pool = load_pool("openai-codex") + pool.select() + + # 401 with no specific reason — stays exhausted, NOT dead. + pool.mark_exhausted_and_rotate( + status_code=401, + error_context={"message": "Unauthorized"}, + ) + + auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text()) + persisted = auth_payload["credential_pool"]["openai-codex"][0] + assert persisted["last_status"] == STATUS_EXHAUSTED + assert persisted["last_error_code"] == 401 + + +def test_dead_manual_entry_pruned_after_24h(tmp_path, monkeypatch): + """A DEAD manual entry is removed from the pool after the prune TTL. + + Manual entries (``manual:*``) are independent credentials with no + singleton to re-seed from, so we can clean them up after a quiet + window without losing recoverability — the user can always re-add + via ``hermes auth add``. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes")) + # DEAD entry from > 24h ago + long_ago = time.time() - (25 * 3600) + _write_auth_store( + tmp_path, + { + "version": 1, + "credential_pool": { + "openai-codex": [ + { + "id": "cred-old-dead", + "label": "ancient-dead", + "auth_type": "oauth", + "priority": 0, + "source": "manual:device_code", + "access_token": "stale", + "refresh_token": "stale", + "last_status": "dead", + "last_status_at": long_ago, + "last_error_code": 401, + "last_error_reason": "token_invalidated", + }, + { + "id": "cred-ok", + "label": "healthy", + "auth_type": "oauth", + "priority": 1, + "source": "manual:device_code", + "access_token": "healthy-at", + "refresh_token": "healthy-rt", + }, + ] + }, + }, + ) + + from agent.credential_pool import load_pool + + pool = load_pool("openai-codex") + # Trigger _available_entries via select; that runs the prune. + selected = pool.select() + assert selected is not None + assert selected.id == "cred-ok" + + # On-disk pool should have the dead entry removed. + auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text()) + persisted = auth_payload["credential_pool"]["openai-codex"] + assert len(persisted) == 1 + assert persisted[0]["id"] == "cred-ok" + + +def test_dead_manual_entry_kept_within_24h(tmp_path, monkeypatch): + """A DEAD manual entry stays in the pool until the prune TTL elapses. + + Recent DEAD entries are kept so the audit trail (last_error_reason, + timestamps) remains visible while the user investigates. They simply + don't participate in rotation (covered by the DEAD-skip test above). + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes")) + # DEAD entry from only an hour ago — well within the 24h window + recent = time.time() - 3600 + _write_auth_store( + tmp_path, + { + "version": 1, + "credential_pool": { + "openai-codex": [ + { + "id": "cred-recent-dead", + "label": "recent-dead", + "auth_type": "oauth", + "priority": 0, + "source": "manual:device_code", + "access_token": "stale", + "refresh_token": "stale", + "last_status": "dead", + "last_status_at": recent, + "last_error_code": 401, + "last_error_reason": "token_invalidated", + }, + { + "id": "cred-ok", + "label": "healthy", + "auth_type": "oauth", + "priority": 1, + "source": "manual:device_code", + "access_token": "healthy-at", + "refresh_token": "healthy-rt", + }, + ] + }, + }, + ) + + from agent.credential_pool import load_pool, STATUS_DEAD + + pool = load_pool("openai-codex") + selected = pool.select() + assert selected is not None + assert selected.id == "cred-ok" + + # On-disk pool should still have BOTH entries — recent dead is preserved. + auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text()) + persisted = auth_payload["credential_pool"]["openai-codex"] + assert len(persisted) == 2 + dead_entry = next(e for e in persisted if e["id"] == "cred-recent-dead") + assert dead_entry["last_status"] == STATUS_DEAD + + +def test_dead_singleton_seeded_entry_not_pruned(tmp_path, monkeypatch): + """A DEAD ``device_code`` entry must NOT be pruned even after 24h. + + Singleton-seeded entries get re-created by ``_seed_from_singletons`` on + every ``load_pool()``, so pruning them is pointless — they reappear + immediately with the same stale singleton tokens. Keep them visible + with the DEAD marker so the user knows what's broken. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes")) + long_ago = time.time() - (48 * 3600) + _write_auth_store( + tmp_path, + { + "version": 1, + "providers": { + "openai-codex": { + "tokens": {"access_token": "revoked-at", "refresh_token": "revoked-rt"}, + "last_refresh": "2026-01-01T00:00:00Z", + "auth_mode": "chatgpt", + }, + }, + "credential_pool": { + "openai-codex": [ + { + "id": "cred-seeded-dead", + "label": "seeded-dead", + "auth_type": "oauth", + "priority": 0, + "source": "device_code", # singleton-seeded, NOT manual + "access_token": "revoked-at", + "refresh_token": "revoked-rt", + "last_status": "dead", + "last_status_at": long_ago, + "last_error_code": 401, + "last_error_reason": "token_invalidated", + }, + ] + }, + }, + ) + + from agent.credential_pool import load_pool, STATUS_DEAD + + pool = load_pool("openai-codex") + # No healthy entry available; select returns None (pool empty for rotation). + assert pool.select() is None + + # On-disk: the singleton-seeded DEAD entry is preserved. + auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text()) + persisted = auth_payload["credential_pool"]["openai-codex"] + assert len(persisted) == 1 + assert persisted[0]["id"] == "cred-seeded-dead" + assert persisted[0]["last_status"] == STATUS_DEAD + + def test_load_pool_seeds_env_api_key(tmp_path, monkeypatch): monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes")) monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-seeded")