fix(compression): fail open when lock subsystem is missing (version skew) (#34475)

A process running mismatched module versions — conversation_compression.py
re-imported with the post-#34351 lock code while a long-lived
hermes_state.SessionDB stays bound to the pre-#34351 class in memory — has
the try_acquire_compression_lock call site but not the method. The
AttributeError it raises is NOT a sqlite3.Error, so the method's own
fail-open guard never runs; the exception escapes to the outer agent loop,
which prints the error and retries. Compression never succeeds, the token
count never drops, and the loop re-triggers compaction forever (the
'API call #47/#48/#49 ... has no attribute try_acquire_compression_lock'
spin a user hit after an update).

Wrap the lock acquire so any unexpected exception fails OPEN: skip locking
and proceed with compression. Skipping the lock risks a rare
concurrent-compression session fork; an infinite no-progress loop that never
compresses at all is strictly worse. The remediation hint in the log points
at the real fix (restart / hermes update to resync the stale module).

Also guards get_compression_lock_holder against the same skew.

Adds a regression test simulating the version skew (real SessionDB wrapped
so only the lock methods raise AttributeError) — asserts _compress_context
proceeds and rotates instead of raising.
This commit is contained in:
Teknium
2026-05-29 01:32:32 -07:00
committed by GitHub
parent e28a668b40
commit db2ce9e7d2
2 changed files with 107 additions and 2 deletions

View File

@ -171,3 +171,70 @@ def test_skipped_compression_returns_messages_unchanged(tmp_path: Path) -> None:
assert agent.session_id == parent_sid
# Compressor was never called (the skip happens before .compress())
agent.context_compressor.compress.assert_not_called()
class _NoLockSubsystemDB:
"""Wraps a real SessionDB but simulates a pre-#34351 version skew.
A long-lived process can hold ``hermes_state.SessionDB`` bound to the
OLD class in memory (no compression-lock methods) while a lazily
re-imported ``conversation_compression.py`` calls the NEW lock code.
``try_acquire_compression_lock`` then raises ``AttributeError`` — which
is NOT a ``sqlite3.Error``, so the method's own fail-open guard never
runs. Before the fix the exception propagated to the outer agent loop,
which printed the error and retried; compression never succeeded, the
token count never dropped, and the loop re-triggered compaction forever.
"""
def __init__(self, real_db: SessionDB) -> None:
self._real = real_db
def try_acquire_compression_lock(self, *_a, **_k): # noqa: D401
raise AttributeError(
"'SessionDB' object has no attribute 'try_acquire_compression_lock'"
)
def get_compression_lock_holder(self, *_a, **_k):
raise AttributeError("'SessionDB' object has no attribute 'get_compression_lock_holder'")
def release_compression_lock(self, *_a, **_k):
raise AttributeError("'SessionDB' object has no attribute 'release_compression_lock'")
def __getattr__(self, name):
# Everything else (create_session, append, rotation helpers) goes to
# the real db so the post-lock compression + rotation path runs.
return getattr(self._real, name)
def test_missing_lock_subsystem_fails_open_not_infinite_loop(tmp_path: Path) -> None:
"""Version skew (no lock methods) must fail OPEN, not raise into the loop.
Reproduces the "API call #47/#48/#49 ... has no attribute
try_acquire_compression_lock" infinite-compaction spin: when the lock
subsystem is absent, ``_compress_context`` must skip locking and proceed
with compression (so the loop makes progress and terminates) instead of
letting the ``AttributeError`` escape to the retry loop.
"""
db = SessionDB(db_path=tmp_path / "state.db")
parent_sid = "SKEW_TEST_SESSION"
db.create_session(parent_sid, source="discord")
agent = _build_agent_with_db(db, parent_sid)
# Swap in the lock-less wrapper AFTER construction (the agent already
# holds a normal db reference; we only break the lock methods).
agent._session_db = _NoLockSubsystemDB(db)
messages = [{"role": "user", "content": f"m{i}"} for i in range(20)]
# MUST NOT raise AttributeError. Before the fix this raised and the
# outer loop would retry forever.
compressed, _sp = agent._compress_context(messages, "sys", approx_tokens=120_000)
# Compression actually ran (proceeded past the broken lock) and made
# progress, so the auto-compress loop would terminate.
agent.context_compressor.compress.assert_called_once()
assert len(compressed) < len(messages), (
"Compression made no progress despite failing open — loop would still spin."
)
# Session rotated (compression succeeded end-to-end).
assert agent.session_id != parent_sid