From 897f9533ed511345d0a729af507abdb2308cfbcb Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 19:21:15 -0700 Subject: [PATCH] fix: keep CLI context display in sync with preflight token estimate (#35079) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Inspired by Claude Code: /compress here [N] — boundary-aware 'summarize up to here' Adds a user-chosen compression boundary to the existing /compress command. /compress here [N] summarizes everything except the most recent N exchanges (default 2), which are preserved verbatim — letting the user pick the compression boundary instead of relying on the automatic token-budget heuristic. Inspired by Claude Code's Rewind 'Summarize up to here' action (v2.1.139, Week 20, May 2026): https://code.claude.com/docs/en/whats-new/2026-w20 - hermes_cli/partial_compress.py: pure split/parse helpers + seam-alternation guard (shared by CLI and gateway). - cli.py / gateway/run.py: route 'here [N]' / '--keep N' to partial compression; compress only the head, re-append the verbatim tail through the seam guard. - Preserves message-flow role alternation (seam guard merges any illegal user->user / assistant->assistant adjacency). - Reuses the existing _compress_context session-rotation/lock machinery — no changes to the compression core. - Bare /compress (full) and /compress <focus> behavior unchanged. Tests: 12 helper unit tests + 5 CLI integration tests + E2E (interleaved tool-call transcript, degenerate/multimodal seams, real handler path). * fix: keep CLI context display in sync with preflight token estimate The status bar reads compressor.last_prompt_tokens, which only updates from a successful API response. When loaded history is oversized but compression no-ops (e.g. 
the auxiliary summary model times out), no fresh usage arrives and the bar stays frozen at the old, smaller value while the preflight estimate reports a much larger number — looking permanently out of sync (reported: 74.4K display vs ~144,669 preflight). Seed last_prompt_tokens with the fresh preflight estimate (upward-only, so a real usage figure is never clobbered and a successful compression's downward correction still wins). Display-only; no behavioral change to compression, caching, or the agent loop. --- agent/conversation_loop.py | 20 +++++++- tests/run_agent/test_413_compression.py | 68 +++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index f72014b9c..e23a513aa 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -606,8 +606,26 @@ def run_conversation( "should_defer_preflight_to_real_usage", lambda _tokens: False, ) + _preflight_deferred = _defer_preflight(_preflight_tokens) - if _defer_preflight(_preflight_tokens): + if not _preflight_deferred: + # Keep the CLI/ACP context display in sync with what preflight + # actually measured. The status bar reads + # ``compressor.last_prompt_tokens``, which otherwise only updates + # from a *successful* API response. When the conversation has grown + # since the last successful call — or when compression then fails + # (e.g. the auxiliary summary model times out) and no fresh usage + # arrives — the bar stays stuck at the old, smaller value while + # preflight reports a much larger number, looking out of sync. + # Seed it with the fresh estimate (only ever revising upward; a real + # ``update_from_response`` will correct it after the next API call). + # Skipped when deferring — a deferred estimate is known to over-count + # vs the last real provider prompt, so trusting it for the display + # would re-introduce the very desync we're avoiding. 
+ if _preflight_tokens > (_compressor.last_prompt_tokens or 0): + _compressor.last_prompt_tokens = _preflight_tokens + + if _preflight_deferred: logger.info( "Skipping preflight compression: rough estimate ~%s >= %s, " "but last real provider prompt was %s after compression", diff --git a/tests/run_agent/test_413_compression.py b/tests/run_agent/test_413_compression.py index a2838d7cf..cadb26c44 100644 --- a/tests/run_agent/test_413_compression.py +++ b/tests/run_agent/test_413_compression.py @@ -665,6 +665,74 @@ class TestPreflightCompression: mock_compress.assert_not_called() assert result["completed"] is True + def test_preflight_seeds_display_tokens_when_compression_aborts(self, agent): + """Display must reflect the real context size even when compression no-ops. + + Regression: the CLI status bar reads ``last_prompt_tokens``, which only + updated from a *successful* API response. When the loaded history was + oversized but compression failed to reduce it (e.g. the auxiliary + summary model timed out), the bar stayed stuck at the old, smaller + value while the preflight estimate reported a much larger number — + looking permanently out of sync. + """ + agent.compression_enabled = True + agent.context_compressor.context_length = 200_000 + agent.context_compressor.threshold_tokens = 130_000 + # Simulate a stale display value from an earlier, smaller turn. 
+ agent.context_compressor.last_prompt_tokens = 74_400 + + big_history = [] + for i in range(20): + big_history.append({"role": "user", "content": f"Message {i} padded text"}) + big_history.append({"role": "assistant", "content": f"Response {i} padded text"}) + + ok_resp = _mock_response(content="After preflight", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [ok_resp] + + with ( + patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669), + # Compression no-ops (returns input unchanged) — mirrors an aux + # summary-model timeout where the messages can't be reduced. + patch.object(agent, "_compress_context", side_effect=lambda msgs, *a, **k: (msgs, agent._cached_system_prompt)), + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("hello", conversation_history=big_history) + + assert result["completed"] is True + # The display token count was revised up to the fresh preflight estimate, + # not left at the stale 74_400. + assert agent.context_compressor.last_prompt_tokens == 144_669 + + def test_preflight_seed_only_revises_upward(self, agent): + """A larger tracked value must not be clobbered by a smaller estimate.""" + agent.compression_enabled = True + agent.context_compressor.context_length = 200_000 + agent.context_compressor.threshold_tokens = 130_000 + # A real, larger usage figure is already tracked. 
+ agent.context_compressor.last_prompt_tokens = 160_000 + + big_history = [] + for i in range(20): + big_history.append({"role": "user", "content": f"Message {i} padded text"}) + big_history.append({"role": "assistant", "content": f"Response {i} padded text"}) + + ok_resp = _mock_response(content="After preflight", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [ok_resp] + + with ( + patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669), + patch.object(agent, "_compress_context", side_effect=lambda msgs, *a, **k: (msgs, agent._cached_system_prompt)), + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + agent.run_conversation("hello", conversation_history=big_history) + + # Smaller estimate must not overwrite the larger tracked value. + assert agent.context_compressor.last_prompt_tokens == 160_000 + class TestToolResultPreflightCompression: """Compression should trigger when tool results push context past the threshold."""