diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index f72014b9c..e23a513aa 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -606,8 +606,26 @@ def run_conversation( "should_defer_preflight_to_real_usage", lambda _tokens: False, ) + _preflight_deferred = _defer_preflight(_preflight_tokens) - if _defer_preflight(_preflight_tokens): + if not _preflight_deferred: + # Keep the CLI/ACP context display in sync with what preflight + # actually measured. The status bar reads + # ``compressor.last_prompt_tokens``, which otherwise only updates + # from a *successful* API response. When the conversation has grown + # since the last successful call — or when compression then fails + # (e.g. the auxiliary summary model times out) and no fresh usage + # arrives — the bar stays stuck at the old, smaller value while + # preflight reports a much larger number, looking out of sync. + # Seed it with the fresh estimate (only ever revising upward; a real + # ``update_from_response`` will correct it after the next API call). + # Skipped when deferring — a deferred estimate is known to over-count + # vs the last real provider prompt, so trusting it for the display + # would re-introduce the very desync we're avoiding. + if _preflight_tokens > (_compressor.last_prompt_tokens or 0): + _compressor.last_prompt_tokens = _preflight_tokens + + if _preflight_deferred: logger.info( "Skipping preflight compression: rough estimate ~%s >= %s, " "but last real provider prompt was %s after compression", diff --git a/tests/run_agent/test_413_compression.py b/tests/run_agent/test_413_compression.py index a2838d7cf..cadb26c44 100644 --- a/tests/run_agent/test_413_compression.py +++ b/tests/run_agent/test_413_compression.py @@ -665,6 +665,74 @@ class TestPreflightCompression: mock_compress.assert_not_called() assert result["completed"] is True + def test_preflight_seeds_display_tokens_when_compression_aborts(self, agent): + """Display must reflect the real context size even when compression no-ops. + + Regression: the CLI status bar reads ``last_prompt_tokens``, which only + updated from a *successful* API response. When the loaded history was + oversized but compression failed to reduce it (e.g. the auxiliary + summary model timed out), the bar stayed stuck at the old, smaller + value while the preflight estimate reported a much larger number — + looking permanently out of sync. + """ + agent.compression_enabled = True + agent.context_compressor.context_length = 200_000 + agent.context_compressor.threshold_tokens = 130_000 + # Simulate a stale display value from an earlier, smaller turn. + agent.context_compressor.last_prompt_tokens = 74_400 + + big_history = [] + for i in range(20): + big_history.append({"role": "user", "content": f"Message {i} padded text"}) + big_history.append({"role": "assistant", "content": f"Response {i} padded text"}) + + ok_resp = _mock_response(content="After preflight", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [ok_resp] + + with ( + patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669), + # Compression no-ops (returns input unchanged) — mirrors an aux + # summary-model timeout where the messages can't be reduced. + patch.object(agent, "_compress_context", side_effect=lambda msgs, *a, **k: (msgs, agent._cached_system_prompt)), + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("hello", conversation_history=big_history) + + assert result["completed"] is True + # The display token count was revised up to the fresh preflight estimate, + # not left at the stale 74_400. + assert agent.context_compressor.last_prompt_tokens == 144_669 + + def test_preflight_seed_only_revises_upward(self, agent): + """A larger tracked value must not be clobbered by a smaller estimate.""" + agent.compression_enabled = True + agent.context_compressor.context_length = 200_000 + agent.context_compressor.threshold_tokens = 130_000 + # A real, larger usage figure is already tracked. + agent.context_compressor.last_prompt_tokens = 160_000 + + big_history = [] + for i in range(20): + big_history.append({"role": "user", "content": f"Message {i} padded text"}) + big_history.append({"role": "assistant", "content": f"Response {i} padded text"}) + + ok_resp = _mock_response(content="After preflight", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [ok_resp] + + with ( + patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=144_669), + patch.object(agent, "_compress_context", side_effect=lambda msgs, *a, **k: (msgs, agent._cached_system_prompt)), + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + agent.run_conversation("hello", conversation_history=big_history) + + # Smaller estimate must not overwrite the larger tracked value. + assert agent.context_compressor.last_prompt_tokens == 160_000 + class TestToolResultPreflightCompression: """Compression should trigger when tool results push context past the threshold."""