fix(credential-pool): STATUS_DEAD for terminal OAuth failures (#32849) (#34412)

When OpenAI Codex returns 401 token_invalidated or token_revoked, the
credential is broken upstream — retrying after a TTL cooldown cannot
fix it. The existing code treated every 401/429 the same way:
STATUS_EXHAUSTED with a TTL cooldown (5 min for 401, 1 hour for 429).
After the TTL elapsed, the broken credential re-entered rotation and
immediately failed again with the same 401, surfacing as 'Failed to
generate context summary' on every context-compression cycle.

Reporter observed 7 separate 401 token_invalidated failures from the
same revoked credential in a single day; the only workaround was
removing it manually via 'hermes auth'.

Add a STATUS_DEAD terminal state. Only 401 responses whose
error.code/reason matches a known terminal OAuth state (token_invalidated,
token_revoked, invalid_token, invalid_grant, unauthorized_client,
refresh_token_reused) transition to DEAD. Everything else keeps the
existing TTL semantics — 429 rate limits are transient and should
recover.

DEAD entries are excluded from rotation unconditionally. They only
clear when an explicit write-side re-auth sync rewrites the tokens
(the existing _sync_codex_pool_entries / _sync_*_entry_from_auth_store
paths already clear last_status to None). The read-side
auth.json-sync paths also now fire on DEAD so an in-flight pool entry
can adopt fresh tokens written by another process without needing
explicit re-auth.

After 24 hours, DEAD manual entries (source='manual:*') are pruned
from the pool automatically so dead state doesn't accumulate forever.
Singleton-seeded DEAD entries (source='device_code' etc.) are kept
because _seed_from_singletons would recreate them on the next load
with the same stale tokens — pruning would be pointless. The audit
trail stays visible (label, last_error_reason, timestamps).

Closes #32849.
This commit is contained in:
Teknium
2026-05-28 23:45:42 -07:00
committed by GitHub
parent ae6817f7f7
commit 86a389fee2
2 changed files with 529 additions and 9 deletions

View File

@ -55,6 +55,38 @@ def _load_config_safe() -> Optional[dict]:
STATUS_OK = "ok"
STATUS_EXHAUSTED = "exhausted"
# Terminal failure — the credential will never recover on its own. Used for
# upstream-permanent OAuth states like ``token_invalidated`` / ``token_revoked``
# where retrying after a TTL cooldown is guaranteed to fail. ``DEAD`` entries
# are excluded from rotation unconditionally and only clear when an explicit
# write-side sync (e.g. ``_save_codex_tokens`` after a fresh device-code
# login) rewrites the tokens.
STATUS_DEAD = "dead"
# OAuth error reasons that indicate the credential is permanently invalid
# server-side and cannot be recovered by retry/refresh. Sourced from
# OpenAI Codex Responses API, Anthropic, xAI, and Google OAuth spec.
_TERMINAL_AUTH_REASONS = frozenset({
"token_invalidated", # OpenAI Codex: "Your authentication token has been invalidated."
"token_revoked", # OAuth 2.0 RFC 7009: token explicitly revoked
"invalid_token", # RFC 6750: bearer token is malformed/expired/revoked
"invalid_grant", # RFC 6749: refresh_token rejected during refresh
"unauthorized_client", # RFC 6749: client no longer authorized
"refresh_token_reused", # Single-use refresh token consumed by another process
})
# How long a DEAD manual credential is preserved before being pruned.
# Manual entries (``manual:*``) are independent credentials with no singleton
# to re-seed from, so pruning them after a quiet window cleans up dead state
# without losing recoverability — the user always has the option to re-add
# via ``hermes auth add``.
#
# Singleton-seeded entries (``device_code``, ``loopback_pkce``, ``claude_code``)
# are NOT pruned because ``_seed_from_singletons`` would just re-create them
# on the next ``load_pool()`` with the same stale singleton tokens, defeating
# the cleanup. They remain in the pool marked DEAD until an explicit re-auth
# write-side sync (``_save_codex_tokens`` etc.) clears the status.
DEAD_MANUAL_PRUNE_TTL_SECONDS = 24 * 60 * 60 # 24 hours
AUTH_TYPE_OAUTH = "oauth"
AUTH_TYPE_API_KEY = "api_key"
@ -438,6 +470,29 @@ class CredentialPool:
[entry.to_dict() for entry in self._entries],
)
def _is_terminal_auth_failure(
self,
status_code: Optional[int],
normalized_error: Dict[str, Any],
) -> bool:
"""Detect upstream-permanent OAuth failures that won't recover on TTL.
Only fires for 401 responses whose error code/reason matches a known
terminal OAuth state (token_invalidated, token_revoked, invalid_grant,
etc.). Distinguishes permanent failures from transient ones like
token_expired (refreshable) or generic 401 without a specific reason
(could be a server-side glitch worth retrying).
Returns False for non-401 status codes — 429 rate limits and 402
billing failures are transient by nature and should keep TTL semantics.
"""
if status_code != 401:
return False
reason = normalized_error.get("reason")
if not isinstance(reason, str):
return False
return reason.strip().lower() in _TERMINAL_AUTH_REASONS
def _mark_exhausted(
self,
entry: PooledCredential,
@ -445,9 +500,20 @@ class CredentialPool:
error_context: Optional[Dict[str, Any]] = None,
) -> PooledCredential:
normalized_error = _normalize_error_context(error_context)
# Permanent OAuth failures (token_invalidated, token_revoked, etc.)
# transition to STATUS_DEAD instead of STATUS_EXHAUSTED. Without this,
# a revoked credential gets a 1-hour TTL cooldown and then re-enters
# rotation, failing immediately every hour until the user manually
# removes it (issue #32849). DEAD entries are excluded from rotation
# unconditionally and only clear via an explicit re-auth write-side
# sync (``_save_codex_tokens`` after a fresh device-code login).
if self._is_terminal_auth_failure(status_code, normalized_error):
terminal_status = STATUS_DEAD
else:
terminal_status = STATUS_EXHAUSTED
updated = replace(
entry,
last_status=STATUS_EXHAUSTED,
last_status=terminal_status,
last_status_at=time.time(),
last_error_code=status_code,
last_error_reason=normalized_error.get("reason"),
@ -1158,13 +1224,14 @@ class CredentialPool:
"""
now = time.time()
cleared_any = False
entries_to_prune: List[str] = []
available: List[PooledCredential] = []
for entry in self._entries:
# For anthropic claude_code entries, sync from the credentials file
# before any status/refresh checks. This picks up tokens refreshed
# by other processes (Claude Code CLI, other Hermes profiles).
if (self.provider == "anthropic" and entry.source == "claude_code"
and entry.last_status == STATUS_EXHAUSTED):
and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
synced = self._sync_anthropic_entry_from_credentials_file(entry)
if synced is not entry:
entry = synced
@ -1175,7 +1242,7 @@ class CredentialPool:
# exhausted status stale.
if (self.provider == "nous"
and entry.source == "device_code"
and entry.last_status == STATUS_EXHAUSTED):
and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
synced = self._sync_nous_entry_from_auth_store(entry)
if synced is not entry:
entry = synced
@ -1187,7 +1254,7 @@ class CredentialPool:
# future for ChatGPT weekly windows).
if (self.provider == "openai-codex"
and entry.source == "device_code"
and entry.last_status == STATUS_EXHAUSTED):
and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
synced = self._sync_codex_entry_from_auth_store(entry)
if synced is not entry:
entry = synced
@ -1198,11 +1265,41 @@ class CredentialPool:
# xAI Grok OAuth login) has since rotated in auth.json.
if (self.provider == "xai-oauth"
and entry.source == "loopback_pkce"
and entry.last_status == STATUS_EXHAUSTED):
and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
synced = self._sync_xai_oauth_entry_from_auth_store(entry)
if synced is not entry:
entry = synced
cleared_any = True
if entry.last_status == STATUS_DEAD:
# Manual DEAD credentials get pruned after a 24h quiet window
# so the pool doesn't accumulate dead entries forever. The
# user can always re-add via ``hermes auth add``. Singleton-
# seeded DEAD entries are kept so the audit trail (label,
# last_error_reason, timestamps) stays visible — pruning them
# would just be undone by ``_seed_from_singletons`` on the
# next load anyway.
if _is_manual_source(entry.source):
dead_at = entry.last_status_at or 0
if dead_at and now - dead_at > DEAD_MANUAL_PRUNE_TTL_SECONDS:
_label = entry.label or entry.id[:8]
logger.warning(
"credential pool: pruning DEAD manual entry %s "
"(reason=%s, age=%.1fh) — re-add via `hermes auth add %s`",
_label,
entry.last_error_reason or "unknown",
(now - dead_at) / 3600.0,
self.provider,
)
# Mark for removal after the loop completes; we can't
# mutate self._entries while iterating.
entries_to_prune.append(entry.id)
cleared_any = True
# Permanently failed credentials never re-enter rotation via
# TTL. They only clear when a write-side re-auth sync rewrites
# the tokens (e.g. ``_save_codex_tokens`` after a fresh
# device-code login). The auth.json-sync paths below handle
# the re-auth case for OAuth singletons.
continue
if entry.last_status == STATUS_EXHAUSTED:
exhausted_until = _exhausted_until(entry)
if exhausted_until is not None and now < exhausted_until:
@ -1226,6 +1323,9 @@ class CredentialPool:
continue
entry = refreshed
available.append(entry)
if entries_to_prune:
pruned_ids = set(entries_to_prune)
self._entries = [e for e in self._entries if e.id not in pruned_ids]
if cleared_any:
self._persist()
return available
@ -1293,11 +1393,22 @@ class CredentialPool:
if entry is None:
return None
_label = entry.label or entry.id[:8]
logger.info(
"credential pool: marking %s exhausted (status=%s), rotating",
_label, status_code,
)
self._mark_exhausted(entry, status_code, error_context)
# Re-read the updated entry to log the correct terminal state.
updated_entry = next(
(e for e in self._entries if e.id == entry.id), entry,
)
if updated_entry.last_status == STATUS_DEAD:
logger.warning(
"credential pool: marking %s DEAD (status=%s, reason=%s) — "
"permanently failed, will NOT re-enter rotation until re-auth",
_label, status_code, updated_entry.last_error_reason or "unknown",
)
else:
logger.info(
"credential pool: marking %s exhausted (status=%s), rotating",
_label, status_code,
)
self._current_id = None
next_entry = self._select_unlocked()
if next_entry:

View File

@ -379,6 +379,415 @@ def test_mark_exhausted_and_rotate_persists_status(tmp_path, monkeypatch):
assert persisted["last_error_code"] == 402
def test_token_invalidated_marks_credential_dead(tmp_path, monkeypatch):
"""OpenAI Codex token_invalidated must mark the credential DEAD, not exhausted.
Regression for #32849: when an OAuth credential is revoked upstream, the
1-hour exhausted TTL means it re-enters rotation every hour and fails
again with the same 401 — surfacing as "Failed to generate context
summary" on context compression. Terminal OAuth failures should never
auto-recover.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
_write_auth_store(
tmp_path,
{
"version": 1,
"credential_pool": {
"openai-codex": [
{
"id": "cred-dead",
"label": "revoked",
"auth_type": "oauth",
"priority": 0,
"source": "manual:device_code",
"access_token": "revoked-at",
"refresh_token": "revoked-rt",
},
{
"id": "cred-ok",
"label": "healthy",
"auth_type": "oauth",
"priority": 1,
"source": "manual:device_code",
"access_token": "healthy-at",
"refresh_token": "healthy-rt",
},
]
},
},
)
from agent.credential_pool import load_pool, STATUS_DEAD
pool = load_pool("openai-codex")
assert pool.select().id == "cred-dead"
# Simulate the exact OpenAI Codex 401 token_invalidated response shape.
next_entry = pool.mark_exhausted_and_rotate(
status_code=401,
error_context={
"reason": "token_invalidated",
"message": "Your authentication token has been invalidated. Please try signing in again.",
},
)
# Rotation still works — we hand off to the healthy credential.
assert next_entry is not None
assert next_entry.id == "cred-ok"
# The revoked credential is now permanently marked DEAD.
auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
persisted = auth_payload["credential_pool"]["openai-codex"][0]
assert persisted["last_status"] == STATUS_DEAD
assert persisted["last_error_code"] == 401
assert persisted["last_error_reason"] == "token_invalidated"
def test_dead_credential_never_re_enters_rotation_after_ttl(tmp_path, monkeypatch):
"""A DEAD credential must stay excluded regardless of how much time passes.
The exhausted TTL clears entries after 5 min (401) / 1 hour (429).
A DEAD credential has no recovery TTL — it stays dead until either
(a) an explicit re-auth write-side sync rewrites the tokens, or
(b) the manual-prune TTL elapses (covered by separate tests below).
This test verifies the core invariant in the recent-entry window.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
# DEAD entry from 2 hours ago — well past the exhausted TTLs (5min/1h)
# but well within the 24h manual-prune window.
two_hours_ago = time.time() - (2 * 3600)
_write_auth_store(
tmp_path,
{
"version": 1,
"credential_pool": {
"openai-codex": [
{
"id": "cred-dead",
"label": "revoked",
"auth_type": "oauth",
"priority": 0,
"source": "manual:device_code",
"access_token": "revoked-at",
"refresh_token": "revoked-rt",
"last_status": "dead",
"last_status_at": two_hours_ago,
"last_error_code": 401,
"last_error_reason": "token_invalidated",
},
{
"id": "cred-ok",
"label": "healthy",
"auth_type": "oauth",
"priority": 1,
"source": "manual:device_code",
"access_token": "healthy-at",
"refresh_token": "healthy-rt",
},
]
},
},
)
from agent.credential_pool import load_pool, STATUS_DEAD
pool = load_pool("openai-codex")
selected = pool.select()
# Should skip the dead entry and pick the healthy one — even though
# the dead entry has priority 0 (would normally be picked first) and
# plenty of time has passed since it was marked dead.
assert selected is not None
assert selected.id == "cred-ok"
# The DEAD entry is still marked dead on disk — not cleared by TTL.
auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
dead_entry = next(e for e in auth_payload["credential_pool"]["openai-codex"]
if e["id"] == "cred-dead")
assert dead_entry["last_status"] == STATUS_DEAD
def test_429_rate_limit_still_uses_exhausted_not_dead(tmp_path, monkeypatch):
"""429 rate limits must NOT be treated as terminal.
They should keep the existing 1-hour TTL cooldown semantics so the
credential re-enters rotation once the rate window resets.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
_write_auth_store(
tmp_path,
{
"version": 1,
"credential_pool": {
"openai-codex": [
{
"id": "cred-1",
"label": "primary",
"auth_type": "oauth",
"priority": 0,
"source": "manual:device_code",
"access_token": "at-1",
"refresh_token": "rt-1",
},
{
"id": "cred-2",
"label": "secondary",
"auth_type": "oauth",
"priority": 1,
"source": "manual:device_code",
"access_token": "at-2",
"refresh_token": "rt-2",
},
]
},
},
)
from agent.credential_pool import load_pool, STATUS_EXHAUSTED
pool = load_pool("openai-codex")
assert pool.select().id == "cred-1"
next_entry = pool.mark_exhausted_and_rotate(
status_code=429,
error_context={"reason": "rate_limit_exceeded", "message": "Rate limit exceeded"},
)
assert next_entry is not None
assert next_entry.id == "cred-2"
auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
persisted = auth_payload["credential_pool"]["openai-codex"][0]
# 429 stays exhausted (transient) — NOT dead.
assert persisted["last_status"] == STATUS_EXHAUSTED
assert persisted["last_error_code"] == 429
def test_generic_401_without_terminal_reason_still_uses_exhausted(tmp_path, monkeypatch):
"""A 401 with no specific code/reason should keep TTL semantics.
Only specific terminal reasons (token_invalidated, token_revoked, etc.)
transition to DEAD. A generic 401 might be a transient server-side
issue worth retrying after the 5-min TTL.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
_write_auth_store(
tmp_path,
{
"version": 1,
"credential_pool": {
"openai-codex": [
{
"id": "cred-1",
"label": "primary",
"auth_type": "oauth",
"priority": 0,
"source": "manual:device_code",
"access_token": "at-1",
"refresh_token": "rt-1",
},
{
"id": "cred-2",
"label": "secondary",
"auth_type": "oauth",
"priority": 1,
"source": "manual:device_code",
"access_token": "at-2",
"refresh_token": "rt-2",
},
]
},
},
)
from agent.credential_pool import load_pool, STATUS_EXHAUSTED
pool = load_pool("openai-codex")
pool.select()
# 401 with no specific reason — stays exhausted, NOT dead.
pool.mark_exhausted_and_rotate(
status_code=401,
error_context={"message": "Unauthorized"},
)
auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
persisted = auth_payload["credential_pool"]["openai-codex"][0]
assert persisted["last_status"] == STATUS_EXHAUSTED
assert persisted["last_error_code"] == 401
def test_dead_manual_entry_pruned_after_24h(tmp_path, monkeypatch):
"""A DEAD manual entry is removed from the pool after the prune TTL.
Manual entries (``manual:*``) are independent credentials with no
singleton to re-seed from, so we can clean them up after a quiet
window without losing recoverability — the user can always re-add
via ``hermes auth add``.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
# DEAD entry from > 24h ago
long_ago = time.time() - (25 * 3600)
_write_auth_store(
tmp_path,
{
"version": 1,
"credential_pool": {
"openai-codex": [
{
"id": "cred-old-dead",
"label": "ancient-dead",
"auth_type": "oauth",
"priority": 0,
"source": "manual:device_code",
"access_token": "stale",
"refresh_token": "stale",
"last_status": "dead",
"last_status_at": long_ago,
"last_error_code": 401,
"last_error_reason": "token_invalidated",
},
{
"id": "cred-ok",
"label": "healthy",
"auth_type": "oauth",
"priority": 1,
"source": "manual:device_code",
"access_token": "healthy-at",
"refresh_token": "healthy-rt",
},
]
},
},
)
from agent.credential_pool import load_pool
pool = load_pool("openai-codex")
# Trigger _available_entries via select; that runs the prune.
selected = pool.select()
assert selected is not None
assert selected.id == "cred-ok"
# On-disk pool should have the dead entry removed.
auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
persisted = auth_payload["credential_pool"]["openai-codex"]
assert len(persisted) == 1
assert persisted[0]["id"] == "cred-ok"
def test_dead_manual_entry_kept_within_24h(tmp_path, monkeypatch):
"""A DEAD manual entry stays in the pool until the prune TTL elapses.
Recent DEAD entries are kept so the audit trail (last_error_reason,
timestamps) remains visible while the user investigates. They simply
don't participate in rotation (covered by the DEAD-skip test above).
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
# DEAD entry from only an hour ago — well within the 24h window
recent = time.time() - 3600
_write_auth_store(
tmp_path,
{
"version": 1,
"credential_pool": {
"openai-codex": [
{
"id": "cred-recent-dead",
"label": "recent-dead",
"auth_type": "oauth",
"priority": 0,
"source": "manual:device_code",
"access_token": "stale",
"refresh_token": "stale",
"last_status": "dead",
"last_status_at": recent,
"last_error_code": 401,
"last_error_reason": "token_invalidated",
},
{
"id": "cred-ok",
"label": "healthy",
"auth_type": "oauth",
"priority": 1,
"source": "manual:device_code",
"access_token": "healthy-at",
"refresh_token": "healthy-rt",
},
]
},
},
)
from agent.credential_pool import load_pool, STATUS_DEAD
pool = load_pool("openai-codex")
selected = pool.select()
assert selected is not None
assert selected.id == "cred-ok"
# On-disk pool should still have BOTH entries — recent dead is preserved.
auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
persisted = auth_payload["credential_pool"]["openai-codex"]
assert len(persisted) == 2
dead_entry = next(e for e in persisted if e["id"] == "cred-recent-dead")
assert dead_entry["last_status"] == STATUS_DEAD
def test_dead_singleton_seeded_entry_not_pruned(tmp_path, monkeypatch):
"""A DEAD ``device_code`` entry must NOT be pruned even after 24h.
Singleton-seeded entries get re-created by ``_seed_from_singletons`` on
every ``load_pool()``, so pruning them is pointless — they reappear
immediately with the same stale singleton tokens. Keep them visible
with the DEAD marker so the user knows what's broken.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
long_ago = time.time() - (48 * 3600)
_write_auth_store(
tmp_path,
{
"version": 1,
"providers": {
"openai-codex": {
"tokens": {"access_token": "revoked-at", "refresh_token": "revoked-rt"},
"last_refresh": "2026-01-01T00:00:00Z",
"auth_mode": "chatgpt",
},
},
"credential_pool": {
"openai-codex": [
{
"id": "cred-seeded-dead",
"label": "seeded-dead",
"auth_type": "oauth",
"priority": 0,
"source": "device_code", # singleton-seeded, NOT manual
"access_token": "revoked-at",
"refresh_token": "revoked-rt",
"last_status": "dead",
"last_status_at": long_ago,
"last_error_code": 401,
"last_error_reason": "token_invalidated",
},
]
},
},
)
from agent.credential_pool import load_pool, STATUS_DEAD
pool = load_pool("openai-codex")
# No healthy entry available; select returns None (pool empty for rotation).
assert pool.select() is None
# On-disk: the singleton-seeded DEAD entry is preserved.
auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
persisted = auth_payload["credential_pool"]["openai-codex"]
assert len(persisted) == 1
assert persisted[0]["id"] == "cred-seeded-dead"
assert persisted[0]["last_status"] == STATUS_DEAD
def test_load_pool_seeds_env_api_key(tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-seeded")