From e38b0b55d12cfa39a6ac71d553d224c0711856f2 Mon Sep 17 00:00:00 2001 From: helix4u <4317663+helix4u@users.noreply.github.com> Date: Fri, 29 May 2026 14:45:53 -0600 Subject: [PATCH] fix(compression): avoid repeat preflight compaction from rough estimates --- agent/context_compressor.py | 46 +++++++++++++++ agent/context_engine.py | 9 +++ agent/conversation_compression.py | 15 +++-- agent/conversation_loop.py | 31 ++++++++-- tests/agent/test_context_compressor.py | 29 ++++++++++ tests/run_agent/test_413_compression.py | 77 +++++++++++++++++++++++++ 6 files changed, 193 insertions(+), 14 deletions(-) diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 58829dbf4..cf9c534de 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -518,6 +518,10 @@ class ContextCompressor(ContextEngine): self._last_compression_savings_pct = 100.0 self._ineffective_compression_count = 0 self._summary_failure_cooldown_until = 0.0 # transient errors must not block a fresh session + self.last_real_prompt_tokens = 0 + self.last_compression_rough_tokens = 0 + self.last_rough_tokens_when_real_prompt_fit = 0 + self.awaiting_real_usage_after_compression = False def update_model( self, @@ -615,6 +619,10 @@ class ContextCompressor(ContextEngine): self.last_prompt_tokens = 0 self.last_completion_tokens = 0 + self.last_real_prompt_tokens = 0 + self.last_compression_rough_tokens = 0 + self.last_rough_tokens_when_real_prompt_fit = 0 + self.awaiting_real_usage_after_compression = False self.summary_model = summary_model_override or "" @@ -648,6 +656,44 @@ class ContextCompressor(ContextEngine): self.last_prompt_tokens = usage.get("prompt_tokens", 0) self.last_completion_tokens = usage.get("completion_tokens", 0) self.last_total_tokens = usage.get("total_tokens", self.last_prompt_tokens + self.last_completion_tokens) + if self.last_prompt_tokens > 0: + self.last_real_prompt_tokens = self.last_prompt_tokens + if self.last_prompt_tokens < self.threshold_tokens: + if self.awaiting_real_usage_after_compression and self.last_compression_rough_tokens > 0: + self.last_rough_tokens_when_real_prompt_fit = self.last_compression_rough_tokens + else: + self.last_rough_tokens_when_real_prompt_fit = 0 + self.awaiting_real_usage_after_compression = False + + def should_defer_preflight_to_real_usage(self, rough_tokens: int) -> bool: + """Return True when a high rough preflight estimate is known-noisy. + + ``estimate_request_tokens_rough(..., tools=...)`` intentionally + overestimates schema-heavy requests so Hermes compresses before a + provider rejects the payload. After a successful compressed API call, + though, provider ``prompt_tokens`` are a better signal than repeating + compaction from the same rough schema overhead. Defer only while the + rough estimate has grown modestly since a request the provider proved + fit under the threshold. + """ + if rough_tokens < self.threshold_tokens: + return False + if self.last_real_prompt_tokens <= 0: + return False + if self.last_real_prompt_tokens >= self.threshold_tokens: + return False + + baseline = self.last_rough_tokens_when_real_prompt_fit or self.last_compression_rough_tokens + if baseline <= 0: + return False + + growth = max(0, rough_tokens - baseline) + tolerated_growth = max(4096, int(self.threshold_tokens * 0.05)) + if growth > tolerated_growth: + return False + + self.last_rough_tokens_when_real_prompt_fit = max(baseline, rough_tokens) + return True def should_compress(self, prompt_tokens: int = None) -> bool: """Check if context exceeds the compression threshold. diff --git a/agent/context_engine.py b/agent/context_engine.py index bb426fc18..79c31fb48 100644 --- a/agent/context_engine.py +++ b/agent/context_engine.py @@ -115,6 +115,15 @@ class ContextEngine(ABC): """ return False + def should_defer_preflight_to_real_usage(self, rough_tokens: int) -> bool: + """Return True when preflight should trust recent real usage instead. + + Built-in compression uses this to avoid re-compacting from known-noisy + rough estimates after a compressed request has already fit. Third-party + engines can ignore it safely. + """ + return False + # -- Optional: manual /compress preflight ------------------------------ def has_content_to_compress(self, messages: List[Dict[str, Any]]) -> bool: diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py index 9a93ba4a4..ba8678cc7 100644 --- a/agent/conversation_compression.py +++ b/agent/conversation_compression.py @@ -575,19 +575,18 @@ def compress_context( force=True, ) - # Update token estimate after compaction so pressure calculations - # use the post-compression count, not the stale pre-compression one. - # Use estimate_request_tokens_rough() so tool schemas are included — - # with 50+ tools enabled, schemas alone can add 20-30K tokens, and - # omitting them delays the next compression cycle far past the - # configured threshold (issue #14695). + # Keep the post-compression rough estimate for diagnostics, but do not + # treat it as provider-reported prompt usage. Schema-heavy rough estimates + # can remain above threshold even after the next real API request fits. _compressed_est = estimate_request_tokens_rough( compressed, system_prompt=new_system_prompt or "", tools=agent.tools or None, ) - agent.context_compressor.last_prompt_tokens = _compressed_est + agent.context_compressor.last_compression_rough_tokens = _compressed_est + agent.context_compressor.last_prompt_tokens = -1 agent.context_compressor.last_completion_tokens = 0 + agent.context_compressor.awaiting_real_usage_after_compression = True # Clear the file-read dedup cache. After compression the original # read content is summarised away — if the model re-reads the same @@ -599,7 +598,7 @@ def compress_context( pass logger.info( - "context compression done: session=%s messages=%d->%d tokens=~%s", + "context compression done: session=%s messages=%d->%d rough_tokens=~%s awaiting_real_usage=true", agent.session_id or "none", _pre_msg_count, len(compressed), f"{_compressed_est:,}", ) diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index a6c975be3..f72014b9c 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -600,18 +600,32 @@ def run_conversation( system_prompt=active_system_prompt or "", tools=agent.tools or None, ) + _compressor = agent.context_compressor + _defer_preflight = getattr( + _compressor, + "should_defer_preflight_to_real_usage", + lambda _tokens: False, + ) - if agent.context_compressor.should_compress(_preflight_tokens): + if _defer_preflight(_preflight_tokens): + logger.info( + "Skipping preflight compression: rough estimate ~%s >= %s, " + "but last real provider prompt was %s after compression", + f"{_preflight_tokens:,}", + f"{_compressor.threshold_tokens:,}", + f"{_compressor.last_real_prompt_tokens:,}", + ) + elif _compressor.should_compress(_preflight_tokens): logger.info( "Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)", f"{_preflight_tokens:,}", - f"{agent.context_compressor.threshold_tokens:,}", + f"{_compressor.threshold_tokens:,}", agent.model, - f"{agent.context_compressor.context_length:,}", + f"{_compressor.context_length:,}", ) agent._emit_status( f"📦 Preflight compression: ~{_preflight_tokens:,} tokens " - f">= {agent.context_compressor.threshold_tokens:,} threshold. " + f">= {_compressor.threshold_tokens:,} threshold. " "This may take a moment." ) # May need multiple passes for very large sessions with small @@ -646,8 +660,8 @@ def run_conversation( system_prompt=active_system_prompt or "", tools=agent.tools or None, ) - if _preflight_tokens < agent.context_compressor.threshold_tokens: - break # Under threshold + if not _compressor.should_compress(_preflight_tokens): + break # Under threshold or anti-thrash guard stopped it # Plugin hook: pre_llm_call # Fired once per turn before the tool-calling loop. Plugins can @@ -3862,6 +3876,11 @@ def run_conversation( # inflate completion_tokens with reasoning, # causing premature compression. (#12026) _real_tokens = _compressor.last_prompt_tokens + elif _compressor.last_prompt_tokens == -1: + # Compression just ran and no API-reported prompt count + # has arrived yet. Avoid treating a schema-heavy rough + # post-compression estimate as real context pressure. + _real_tokens = 0 else: # Include tool schemas — with 50+ tools enabled # these add 20-30K tokens the messages-only diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py index 0d7aa81f4..5ce753864 100644 --- a/tests/agent/test_context_compressor.py +++ b/tests/agent/test_context_compressor.py @@ -41,6 +41,8 @@ class TestShouldCompress: class TestUpdateFromResponse: def test_updates_fields(self, compressor): + compressor.awaiting_real_usage_after_compression = True + compressor.last_compression_rough_tokens = 90_000 compressor.update_from_response({ "prompt_tokens": 5000, "completion_tokens": 1000, @@ -48,12 +50,39 @@ class TestUpdateFromResponse: }) assert compressor.last_prompt_tokens == 5000 assert compressor.last_completion_tokens == 1000 + assert compressor.last_real_prompt_tokens == 5000 + assert compressor.last_rough_tokens_when_real_prompt_fit == 90_000 + assert compressor.awaiting_real_usage_after_compression is False def test_missing_fields_default_zero(self, compressor): compressor.update_from_response({}) assert compressor.last_prompt_tokens == 0 +class TestPreflightDeferral: + def test_defers_when_recent_real_usage_fit_and_rough_growth_is_small(self, compressor): + compressor.threshold_tokens = 85_000 + compressor.last_real_prompt_tokens = 50_000 + compressor.last_rough_tokens_when_real_prompt_fit = 90_000 + + assert compressor.should_defer_preflight_to_real_usage(93_000) is True + assert compressor.last_rough_tokens_when_real_prompt_fit == 93_000 + + def test_does_not_defer_when_rough_growth_is_large(self, compressor): + compressor.threshold_tokens = 85_000 + compressor.last_real_prompt_tokens = 50_000 + compressor.last_rough_tokens_when_real_prompt_fit = 90_000 + + assert compressor.should_defer_preflight_to_real_usage(100_000) is False + + def test_does_not_defer_without_recent_real_usage(self, compressor): + compressor.threshold_tokens = 85_000 + compressor.last_real_prompt_tokens = 0 + compressor.last_rough_tokens_when_real_prompt_fit = 90_000 + + assert compressor.should_defer_preflight_to_real_usage(93_000) is False + + class TestCompress: def _make_messages(self, n): diff --git a/tests/run_agent/test_413_compression.py b/tests/run_agent/test_413_compression.py index 6695d6c27..37cafa798 100644 --- a/tests/run_agent/test_413_compression.py +++ b/tests/run_agent/test_413_compression.py @@ -491,6 +491,83 @@ class TestPreflightCompression: for ev, msg in status_messages ) + def test_preflight_defers_when_recent_real_usage_fit(self, agent): + """A noisy rough estimate should not re-compact a recently fitting request.""" + agent.compression_enabled = True + agent.context_compressor.context_length = 200_000 + agent.context_compressor.threshold_tokens = 100_000 + agent.context_compressor.last_prompt_tokens = 58_000 + agent.context_compressor.last_real_prompt_tokens = 58_000 + agent.context_compressor.last_rough_tokens_when_real_prompt_fit = 113_000 + + big_history = [] + for i in range(20): + big_history.append({"role": "user", "content": f"Message {i} padded"}) + big_history.append({"role": "assistant", "content": f"Response {i} padded"}) + + ok_resp = _mock_response( + content="Used real fit", + finish_reason="stop", + usage={"prompt_tokens": 59_000, "completion_tokens": 100, "total_tokens": 59_100}, + ) + agent.client.chat.completions.create.side_effect = [ok_resp] + status_messages = [] + agent.status_callback = lambda ev, msg: status_messages.append((ev, msg)) + + with ( + patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=114_000), + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("hello", conversation_history=big_history) + + mock_compress.assert_not_called() + assert result["completed"] is True + assert result["final_response"] == "Used real fit" + assert not any( + ev == "lifecycle" and "Preflight compression" in msg + for ev, msg in status_messages + ) + + def test_preflight_compresses_when_rough_growth_after_fit_is_large(self, agent): + """Large rough growth after a fitting request still triggers preflight.""" + agent.compression_enabled = True + agent.context_compressor.context_length = 200_000 + agent.context_compressor.threshold_tokens = 100_000 + agent.context_compressor.last_prompt_tokens = 58_000 + agent.context_compressor.last_real_prompt_tokens = 58_000 + agent.context_compressor.last_rough_tokens_when_real_prompt_fit = 113_000 + + big_history = [] + for i in range(20): + big_history.append({"role": "user", "content": f"Message {i} padded"}) + big_history.append({"role": "assistant", "content": f"Response {i} padded"}) + + ok_resp = _mock_response( + content="Compressed after growth", + finish_reason="stop", + usage={"prompt_tokens": 50_000, "completion_tokens": 100, "total_tokens": 50_100}, + ) + agent.client.chat.completions.create.side_effect = [ok_resp] + + with ( + patch("agent.conversation_loop.estimate_request_tokens_rough", side_effect=[125_000, 40_000]), + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + mock_compress.return_value = ( + [{"role": "user", "content": f"{SUMMARY_PREFIX}\nPrevious conversation"}], + "new system prompt", + ) + result = agent.run_conversation("hello", conversation_history=big_history) + + mock_compress.assert_called_once() + assert result["completed"] is True + def test_no_preflight_when_under_threshold(self, agent): """When history fits within context, no preflight compression needed.""" agent.compression_enabled = True