fix: keep CLI context display in sync with preflight token estimate (#35079)
* Inspired by Claude Code: /compress here [N] — boundary-aware 'summarize up to here'

  Adds a user-chosen compression boundary to the existing /compress command. /compress here [N] summarizes everything except the most recent N exchanges (default 2), which are preserved verbatim — letting the user pick the compression boundary instead of relying on the automatic token-budget heuristic.

  Inspired by Claude Code's Rewind 'Summarize up to here' action (v2.1.139, Week 20, May 2026): https://code.claude.com/docs/en/whats-new/2026-w20

  - hermes_cli/partial_compress.py: pure split/parse helpers + seam-alternation guard (shared by CLI and gateway).
  - cli.py / gateway/run.py: route 'here [N]' / '--keep N' to partial compression; compress only the head, re-append the verbatim tail through the seam guard.
  - Preserves message-flow role alternation (seam guard merges any illegal user->user / assistant->assistant adjacency).
  - Reuses the existing _compress_context session-rotation/lock machinery — no changes to the compression core.
  - Bare /compress (full) and /compress <focus> behavior unchanged.

  Tests: 12 helper unit tests + 5 CLI integration tests + E2E (interleaved tool-call transcript, degenerate/multimodal seams, real handler path).

* fix: keep CLI context display in sync with preflight token estimate

  The status bar reads compressor.last_prompt_tokens, which only updates from a successful API response. When loaded history is oversized but compression no-ops (e.g. the auxiliary summary model times out), no fresh usage arrives and the bar stays frozen at the old, smaller value while the preflight estimate reports a much larger number — looking permanently out of sync (reported: 74.4K display vs ~144,669 preflight).

  Seed last_prompt_tokens with the fresh preflight estimate (upward-only, so a real usage figure is never clobbered and a successful compression's downward correction still wins).

  Display-only; no behavioral change to compression, caching, or the agent loop.
This commit is contained in:
@ -606,8 +606,26 @@ def run_conversation(
|
||||
"should_defer_preflight_to_real_usage",
|
||||
lambda _tokens: False,
|
||||
)
|
||||
_preflight_deferred = _defer_preflight(_preflight_tokens)
|
||||
|
||||
if _defer_preflight(_preflight_tokens):
|
||||
if not _preflight_deferred:
|
||||
# Keep the CLI/ACP context display in sync with what preflight
|
||||
# actually measured. The status bar reads
|
||||
# ``compressor.last_prompt_tokens``, which otherwise only updates
|
||||
# from a *successful* API response. When the conversation has grown
|
||||
# since the last successful call — or when compression then fails
|
||||
# (e.g. the auxiliary summary model times out) and no fresh usage
|
||||
# arrives — the bar stays stuck at the old, smaller value while
|
||||
# preflight reports a much larger number, looking out of sync.
|
||||
# Seed it with the fresh estimate (only ever revising upward; a real
|
||||
# ``update_from_response`` will correct it after the next API call).
|
||||
# Skipped when deferring — a deferred estimate is known to over-count
|
||||
# vs the last real provider prompt, so trusting it for the display
|
||||
# would re-introduce the very desync we're avoiding.
|
||||
if _preflight_tokens > (_compressor.last_prompt_tokens or 0):
|
||||
_compressor.last_prompt_tokens = _preflight_tokens
|
||||
|
||||
if _preflight_deferred:
|
||||
logger.info(
|
||||
"Skipping preflight compression: rough estimate ~%s >= %s, "
|
||||
"but last real provider prompt was %s after compression",
|
||||
|
||||
@ -665,6 +665,74 @@ class TestPreflightCompression:
|
||||
mock_compress.assert_not_called()
|
||||
assert result["completed"] is True
|
||||
|
||||
def test_preflight_seeds_display_tokens_when_compression_aborts(self, agent):
    """The tracked prompt-token count follows preflight when compression no-ops.

    Regression guard: ``last_prompt_tokens`` starts at a stale 74_400, the
    rough preflight estimator is patched to report 144_669, and
    ``_compress_context`` is patched to hand the messages back unchanged
    (standing in for e.g. an auxiliary summary-model timeout). After the
    run, ``last_prompt_tokens`` must equal the preflight estimate rather
    than the stale value.
    """
    agent.compression_enabled = True
    agent.context_compressor.context_length = 200_000
    agent.context_compressor.threshold_tokens = 130_000
    # Stale figure left over from an earlier, smaller turn.
    agent.context_compressor.last_prompt_tokens = 74_400

    # 20 user/assistant exchanges -> 40 history messages.
    big_history = []
    for turn in range(20):
        big_history.extend(
            (
                {"role": "user", "content": f"Message {turn} padded text"},
                {"role": "assistant", "content": f"Response {turn} padded text"},
            )
        )

    # The single API call made after preflight succeeds.
    agent.client.chat.completions.create.side_effect = [
        _mock_response(content="After preflight", finish_reason="stop")
    ]

    def _noop_compress(msgs, *a, **k):
        # Compression that cannot shrink anything: same messages back,
        # paired with the currently cached system prompt.
        return msgs, agent._cached_system_prompt

    with (
        patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669),
        patch.object(agent, "_compress_context", side_effect=_noop_compress),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        result = agent.run_conversation("hello", conversation_history=big_history)

    assert result["completed"] is True
    # Revised upward to the fresh preflight estimate, not stuck at 74_400.
    assert agent.context_compressor.last_prompt_tokens == 144_669
|
||||
|
||||
def test_preflight_seed_only_revises_upward(self, agent):
    """A preflight estimate below the tracked value must leave it untouched.

    ``last_prompt_tokens`` starts at 160_000 while the patched rough
    estimator reports only 144_669; after the run the tracked value must
    still be 160_000.
    """
    agent.compression_enabled = True
    agent.context_compressor.context_length = 200_000
    agent.context_compressor.threshold_tokens = 130_000
    # Already-tracked figure that is larger than the upcoming estimate.
    agent.context_compressor.last_prompt_tokens = 160_000

    # 20 user/assistant exchanges -> 40 history messages.
    big_history = []
    for turn in range(20):
        big_history += [
            {"role": "user", "content": f"Message {turn} padded text"},
            {"role": "assistant", "content": f"Response {turn} padded text"},
        ]

    agent.client.chat.completions.create.side_effect = [
        _mock_response(content="After preflight", finish_reason="stop")
    ]

    def _passthrough_compress(msgs, *a, **k):
        # Compression stand-in that returns its input unchanged.
        return msgs, agent._cached_system_prompt

    with (
        patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669),
        patch.object(agent, "_compress_context", side_effect=_passthrough_compress),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        agent.run_conversation("hello", conversation_history=big_history)

    # The smaller estimate did not overwrite the larger tracked value.
    assert agent.context_compressor.last_prompt_tokens == 160_000
|
||||
|
||||
|
||||
class TestToolResultPreflightCompression:
|
||||
"""Compression should trigger when tool results push context past the threshold."""
|
||||
|
||||
Reference in New Issue
Block a user