fix: keep CLI context display in sync with preflight token estimate (#35079)

* Inspired by Claude Code: /compress here [N] — boundary-aware 'summarize up to here'

Adds a user-chosen compression boundary to the existing /compress command.
/compress here [N] summarizes everything except the most recent N exchanges
(default 2), which are preserved verbatim — letting the user pick the
compression boundary instead of relying on the automatic token-budget heuristic.

Inspired by Claude Code's Rewind 'Summarize up to here' action (v2.1.139,
Week 20, May 2026): https://code.claude.com/docs/en/whats-new/2026-w20

- hermes_cli/partial_compress.py: pure split/parse helpers + seam-alternation
  guard (shared by CLI and gateway).
- cli.py / gateway/run.py: route 'here [N]' / '--keep N' to partial compression;
  compress only the head, re-append the verbatim tail through the seam guard.
- Preserves message-flow role alternation (seam guard merges any illegal
  user->user / assistant->assistant adjacency).
- Reuses the existing _compress_context session-rotation/lock machinery — no
  changes to the compression core.
- Bare /compress (full) and /compress <focus> behavior unchanged.

Tests: 12 helper unit tests + 5 CLI integration tests + E2E (interleaved
tool-call transcript, degenerate/multimodal seams, real handler path).

* fix: keep CLI context display in sync with preflight token estimate

The status bar reads compressor.last_prompt_tokens, which only updates
from a successful API response. When loaded history is oversized but
compression no-ops (e.g. the auxiliary summary model times out), no fresh
usage arrives and the bar stays frozen at the old, smaller value while the
preflight estimate reports a much larger number — looking permanently out
of sync (reported: 74.4K display vs ~144,669 preflight).

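Seed last_prompt_tokens with the fresh preflight estimate. The seed is
upward-only, so a larger tracked figure is never clobbered, and the next real
usage update (including a successful compression's downward correction) still
wins. It is skipped when preflight defers to real usage, because a deferred
estimate is known to over-count relative to the last real provider prompt.
Display-only; no behavioral change to compression, caching, or the agent loop.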
This commit is contained in:
Teknium
2026-05-29 19:21:15 -07:00
committed by GitHub
parent 9d4c81130a
commit 897f9533ed
2 changed files with 87 additions and 1 deletions

View File

@ -606,8 +606,26 @@ def run_conversation(
"should_defer_preflight_to_real_usage",
lambda _tokens: False,
)
_preflight_deferred = _defer_preflight(_preflight_tokens)
if _defer_preflight(_preflight_tokens):
if not _preflight_deferred:
# Keep the CLI/ACP context display in sync with what preflight
# actually measured. The status bar reads
# ``compressor.last_prompt_tokens``, which otherwise only updates
# from a *successful* API response. When the conversation has grown
# since the last successful call — or when compression then fails
# (e.g. the auxiliary summary model times out) and no fresh usage
# arrives — the bar stays stuck at the old, smaller value while
# preflight reports a much larger number, looking out of sync.
# Seed it with the fresh estimate (only ever revising upward; a real
# ``update_from_response`` will correct it after the next API call).
# Skipped when deferring — a deferred estimate is known to over-count
# vs the last real provider prompt, so trusting it for the display
# would re-introduce the very desync we're avoiding.
if _preflight_tokens > (_compressor.last_prompt_tokens or 0):
_compressor.last_prompt_tokens = _preflight_tokens
if _preflight_deferred:
logger.info(
"Skipping preflight compression: rough estimate ~%s >= %s, "
"but last real provider prompt was %s after compression",

View File

@ -665,6 +665,74 @@ class TestPreflightCompression:
mock_compress.assert_not_called()
assert result["completed"] is True
def test_preflight_seeds_display_tokens_when_compression_aborts(self, agent):
    """A no-op compression must still leave the display at the preflight size.

    Regression guard: the CLI status bar reads ``last_prompt_tokens``, which
    used to change only after a *successful* API response. With an oversized
    loaded history and a compression pass that could not shrink it (e.g. the
    auxiliary summary model timed out), the bar kept showing the old, smaller
    figure while preflight reported a far larger one.
    """
    agent.compression_enabled = True
    compressor = agent.context_compressor
    compressor.context_length = 200_000
    compressor.threshold_tokens = 130_000
    # Stale display value left over from an earlier, smaller turn.
    compressor.last_prompt_tokens = 74_400

    # Twenty user/assistant exchanges, in strict alternation.
    oversized_history = [
        message
        for turn in range(20)
        for message in (
            {"role": "user", "content": f"Message {turn} padded text"},
            {"role": "assistant", "content": f"Response {turn} padded text"},
        )
    ]

    def _compress_noop(msgs, *args, **kwargs):
        # Return the input unchanged, mirroring an aux summary-model timeout
        # where the messages cannot be reduced.
        return msgs, agent._cached_system_prompt

    reply = _mock_response(content="After preflight", finish_reason="stop")
    agent.client.chat.completions.create.side_effect = [reply]

    with (
        patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669),
        patch.object(agent, "_compress_context", side_effect=_compress_noop),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        result = agent.run_conversation("hello", conversation_history=oversized_history)

    assert result["completed"] is True
    # The tracked count moved up to the fresh preflight estimate instead of
    # staying at the stale 74_400.
    assert agent.context_compressor.last_prompt_tokens == 144_669
def test_preflight_seed_only_revises_upward(self, agent):
    """A smaller preflight estimate must leave a larger tracked value alone."""
    agent.compression_enabled = True
    compressor = agent.context_compressor
    compressor.context_length = 200_000
    compressor.threshold_tokens = 130_000
    # A larger usage figure is already being tracked.
    compressor.last_prompt_tokens = 160_000

    # Twenty user/assistant exchanges, in strict alternation.
    oversized_history = [
        message
        for turn in range(20)
        for message in (
            {"role": "user", "content": f"Message {turn} padded text"},
            {"role": "assistant", "content": f"Response {turn} padded text"},
        )
    ]

    def _compress_noop(msgs, *args, **kwargs):
        # Compression returns its input unchanged.
        return msgs, agent._cached_system_prompt

    reply = _mock_response(content="After preflight", finish_reason="stop")
    agent.client.chat.completions.create.side_effect = [reply]

    with (
        patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669),
        patch.object(agent, "_compress_context", side_effect=_compress_noop),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        agent.run_conversation("hello", conversation_history=oversized_history)

    # The 144_669 estimate is below the tracked 160_000, so nothing changes.
    assert agent.context_compressor.last_prompt_tokens == 160_000
class TestToolResultPreflightCompression:
"""Compression should trigger when tool results push context past the threshold."""