From 897f9533ed511345d0a729af507abdb2308cfbcb Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Fri, 29 May 2026 19:21:15 -0700
Subject: [PATCH] fix: keep CLI context display in sync with preflight token
 estimate (#35079)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

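* Inspired by Claude Code: /compress here [N] — boundary-aware 'summarize up to here'

(Note: the diff below touches only agent/conversation_loop.py and
tests/run_agent/test_413_compression.py; none of the /compress files listed
in this section are changed by it.)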

Adds a user-chosen compression boundary to the existing /compress command.
/compress here [N] summarizes everything except the most recent N exchanges
(default 2), which are preserved verbatim — letting the user pick the
compression boundary instead of relying on the automatic token-budget heuristic.

Inspired by Claude Code's Rewind 'Summarize up to here' action (v2.1.139,
Week 20, May 2026): https://code.claude.com/docs/en/whats-new/2026-w20

- hermes_cli/partial_compress.py: pure split/parse helpers + seam-alternation
  guard (shared by CLI and gateway).
- cli.py / gateway/run.py: route 'here [N]' / '--keep N' to partial compression;
  compress only the head, re-append the verbatim tail through the seam guard.
- Preserves message-flow role alternation (seam guard merges any illegal
  user->user / assistant->assistant adjacency).
- Reuses the existing _compress_context session-rotation/lock machinery — no
  changes to the compression core.
- Bare /compress (full) and /compress <focus> behavior unchanged.

Tests: 12 helper unit tests + 5 CLI integration tests + E2E (interleaved
tool-call transcript, degenerate/multimodal seams, real handler path).

* fix: keep CLI context display in sync with preflight token estimate

The status bar reads compressor.last_prompt_tokens, which only updates
from a successful API response. When loaded history is oversized but
compression no-ops (e.g. the auxiliary summary model times out), no fresh
usage arrives and the bar stays frozen at the old, smaller value while the
preflight estimate reports a much larger number — looking permanently out
of sync (reported: 74.4K display vs ~144,669 preflight).

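Seed last_prompt_tokens with the fresh preflight estimate whenever preflight
is not deferred to real usage (a deferred estimate is known to over-count vs
the last real provider prompt). The seed is upward-only: a larger tracked
value is never clobbered by a smaller estimate, and a successful
compression's downward correction still wins. Display-only; no behavioral
change to compression, caching, or the agent loop.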
---
 agent/conversation_loop.py              | 20 +++++++-
 tests/run_agent/test_413_compression.py | 68 +++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 1 deletion(-)
diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index f72014b9c..e23a513aa 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -606,8 +606,26 @@ def run_conversation(
             "should_defer_preflight_to_real_usage",
             lambda _tokens: False,
         )
+        _preflight_deferred = _defer_preflight(_preflight_tokens)
 
-        if _defer_preflight(_preflight_tokens):
+        if not _preflight_deferred:
+            # Keep the CLI/ACP context display in sync with what preflight
+            # actually measured.  The status bar reads
+            # ``compressor.last_prompt_tokens``, which otherwise only updates
+            # from a *successful* API response.  When the conversation has grown
+            # since the last successful call — or when compression then fails
+            # (e.g. the auxiliary summary model times out) and no fresh usage
+            # arrives — the bar stays stuck at the old, smaller value while
+            # preflight reports a much larger number, looking out of sync.
+            # Seed it with the fresh estimate (only ever revising upward; a real
+            # ``update_from_response`` will correct it after the next API call).
+            # Skipped when deferring — a deferred estimate is known to over-count
+            # vs the last real provider prompt, so trusting it for the display
+            # would re-introduce the very desync we're avoiding.
+            if _preflight_tokens > (_compressor.last_prompt_tokens or 0):
+                _compressor.last_prompt_tokens = _preflight_tokens
+
+        if _preflight_deferred:
             logger.info(
                 "Skipping preflight compression: rough estimate ~%s >= %s, "
                 "but last real provider prompt was %s after compression",
diff --git a/tests/run_agent/test_413_compression.py b/tests/run_agent/test_413_compression.py
index a2838d7cf..cadb26c44 100644
--- a/tests/run_agent/test_413_compression.py
+++ b/tests/run_agent/test_413_compression.py
@@ -665,6 +665,74 @@ class TestPreflightCompression:
         mock_compress.assert_not_called()
         assert result["completed"] is True
 
+    def test_preflight_seeds_display_tokens_when_compression_aborts(self, agent):
+        """Display must reflect the real context size even when compression no-ops.
+
+        Regression: the CLI status bar reads ``last_prompt_tokens``, which only
+        updated from a *successful* API response. When the loaded history was
+        oversized but compression failed to reduce it (e.g. the auxiliary
+        summary model timed out), the bar stayed stuck at the old, smaller
+        value while the preflight estimate reported a much larger number —
+        looking permanently out of sync.
+        """
+        agent.compression_enabled = True
+        agent.context_compressor.context_length = 200_000
+        agent.context_compressor.threshold_tokens = 130_000
+        # Simulate a stale display value from an earlier, smaller turn.
+        agent.context_compressor.last_prompt_tokens = 74_400
+
+        big_history = []
+        for i in range(20):
+            big_history.append({"role": "user", "content": f"Message {i} padded text"})
+            big_history.append({"role": "assistant", "content": f"Response {i} padded text"})
+
+        ok_resp = _mock_response(content="After preflight", finish_reason="stop")
+        agent.client.chat.completions.create.side_effect = [ok_resp]
+
+        with (
+            patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669),
+            # Compression no-ops (returns input unchanged) — mirrors an aux
+            # summary-model timeout where the messages can't be reduced.
+            patch.object(agent, "_compress_context", side_effect=lambda msgs, *a, **k: (msgs, agent._cached_system_prompt)),
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("hello", conversation_history=big_history)
+
+        assert result["completed"] is True
+        # The display token count was revised up to the fresh preflight estimate,
+        # not left at the stale 74_400.
+        assert agent.context_compressor.last_prompt_tokens == 144_669
+
+    def test_preflight_seed_only_revises_upward(self, agent):
+        """A larger tracked value must not be clobbered by a smaller estimate."""
+        agent.compression_enabled = True
+        agent.context_compressor.context_length = 200_000
+        agent.context_compressor.threshold_tokens = 130_000
+        # A real, larger usage figure is already tracked.
+        agent.context_compressor.last_prompt_tokens = 160_000
+
+        big_history = []
+        for i in range(20):
+            big_history.append({"role": "user", "content": f"Message {i} padded text"})
+            big_history.append({"role": "assistant", "content": f"Response {i} padded text"})
+
+        ok_resp = _mock_response(content="After preflight", finish_reason="stop")
+        agent.client.chat.completions.create.side_effect = [ok_resp]
+
+        with (
+            patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669),
+            patch.object(agent, "_compress_context", side_effect=lambda msgs, *a, **k: (msgs, agent._cached_system_prompt)),
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            agent.run_conversation("hello", conversation_history=big_history)
+
+        # Smaller estimate must not overwrite the larger tracked value.
+        assert agent.context_compressor.last_prompt_tokens == 160_000
+
 
 class TestToolResultPreflightCompression:
     """Compression should trigger when tool results push context past the threshold."""