fix(compression): avoid repeat preflight compaction from rough estimates

This commit is contained in:
helix4u
2026-05-29 14:45:53 -06:00
committed by Teknium
parent 04de307d62
commit e38b0b55d1
6 changed files with 193 additions and 14 deletions

View File

@ -518,6 +518,10 @@ class ContextCompressor(ContextEngine):
self._last_compression_savings_pct = 100.0
self._ineffective_compression_count = 0
self._summary_failure_cooldown_until = 0.0 # transient errors must not block a fresh session
self.last_real_prompt_tokens = 0
self.last_compression_rough_tokens = 0
self.last_rough_tokens_when_real_prompt_fit = 0
self.awaiting_real_usage_after_compression = False
def update_model(
self,
@ -615,6 +619,10 @@ class ContextCompressor(ContextEngine):
self.last_prompt_tokens = 0
self.last_completion_tokens = 0
self.last_real_prompt_tokens = 0
self.last_compression_rough_tokens = 0
self.last_rough_tokens_when_real_prompt_fit = 0
self.awaiting_real_usage_after_compression = False
self.summary_model = summary_model_override or ""
@ -648,6 +656,44 @@ class ContextCompressor(ContextEngine):
self.last_prompt_tokens = usage.get("prompt_tokens", 0)
self.last_completion_tokens = usage.get("completion_tokens", 0)
self.last_total_tokens = usage.get("total_tokens", self.last_prompt_tokens + self.last_completion_tokens)
if self.last_prompt_tokens > 0:
self.last_real_prompt_tokens = self.last_prompt_tokens
if self.last_prompt_tokens < self.threshold_tokens:
if self.awaiting_real_usage_after_compression and self.last_compression_rough_tokens > 0:
self.last_rough_tokens_when_real_prompt_fit = self.last_compression_rough_tokens
else:
self.last_rough_tokens_when_real_prompt_fit = 0
self.awaiting_real_usage_after_compression = False
def should_defer_preflight_to_real_usage(self, rough_tokens: int) -> bool:
    """Return True when a high rough preflight estimate is known-noisy.

    ``estimate_request_tokens_rough(..., tools=...)`` intentionally
    overestimates schema-heavy requests so Hermes compresses before a
    provider rejects the payload. After a successful compressed API call,
    though, provider ``prompt_tokens`` are a better signal than repeating
    compaction from the same rough schema overhead. Defer only while the
    rough estimate has grown modestly since a request the provider proved
    fit under the threshold.

    Args:
        rough_tokens: Rough token estimate for the request about to be sent.

    Returns:
        True when preflight compression should be skipped for this request,
        False when the normal ``should_compress`` decision should apply.

    Note:
        This is not a pure predicate: on deferral the stored rough baseline
        (``last_rough_tokens_when_real_prompt_fit``) is raised to
        ``rough_tokens``, so later growth is measured from the new value.
    """
    threshold = self.threshold_tokens

    # A rough estimate under the threshold never triggers preflight
    # compression, so there is nothing to defer.
    if rough_tokens < threshold:
        return False

    # Require provider-reported usage that actually fit under the threshold;
    # a missing (<= 0) or over-threshold real count gives no reason to
    # distrust the rough estimate.
    last_real = self.last_real_prompt_tokens
    if last_real <= 0 or last_real >= threshold:
        return False

    # Prefer the rough estimate recorded when a real prompt was seen to fit;
    # otherwise fall back to the rough estimate taken right after the last
    # compression.
    baseline = (
        self.last_rough_tokens_when_real_prompt_fit
        or self.last_compression_rough_tokens
    )
    if baseline <= 0:
        return False

    # Tolerate growth of 5% of the threshold, but never less than 4096 tokens.
    growth = max(0, rough_tokens - baseline)
    tolerated_growth = max(4096, int(threshold * 0.05))
    if growth > tolerated_growth:
        return False

    # Ratchet the baseline so the next call measures growth from here.
    self.last_rough_tokens_when_real_prompt_fit = max(baseline, rough_tokens)
    return True
def should_compress(self, prompt_tokens: int = None) -> bool:
"""Check if context exceeds the compression threshold.

View File

@ -115,6 +115,15 @@ class ContextEngine(ABC):
"""
return False
def should_defer_preflight_to_real_usage(self, rough_tokens: int) -> bool:
    """Return True when preflight should trust recent real usage instead.

    Built-in compression uses this to avoid re-compacting from known-noisy
    rough estimates after a compressed request has already fit. Third-party
    engines can ignore it safely.

    Args:
        rough_tokens: Rough token estimate for the request about to be sent.

    Returns:
        Always False in this default implementation, i.e. never defer.
    """
    return False
# -- Optional: manual /compress preflight ------------------------------
def has_content_to_compress(self, messages: List[Dict[str, Any]]) -> bool:

View File

@ -575,19 +575,18 @@ def compress_context(
force=True,
)
# Update token estimate after compaction so pressure calculations
# use the post-compression count, not the stale pre-compression one.
# Use estimate_request_tokens_rough() so tool schemas are included —
# with 50+ tools enabled, schemas alone can add 20-30K tokens, and
# omitting them delays the next compression cycle far past the
# configured threshold (issue #14695).
# Keep the post-compression rough estimate for diagnostics, but do not
# treat it as provider-reported prompt usage. Schema-heavy rough estimates
# can remain above threshold even after the next real API request fits.
_compressed_est = estimate_request_tokens_rough(
compressed,
system_prompt=new_system_prompt or "",
tools=agent.tools or None,
)
agent.context_compressor.last_prompt_tokens = _compressed_est
agent.context_compressor.last_compression_rough_tokens = _compressed_est
agent.context_compressor.last_prompt_tokens = -1
agent.context_compressor.last_completion_tokens = 0
agent.context_compressor.awaiting_real_usage_after_compression = True
# Clear the file-read dedup cache. After compression the original
# read content is summarised away — if the model re-reads the same
@ -599,7 +598,7 @@ def compress_context(
pass
logger.info(
"context compression done: session=%s messages=%d->%d tokens=~%s",
"context compression done: session=%s messages=%d->%d rough_tokens=~%s awaiting_real_usage=true",
agent.session_id or "none", _pre_msg_count, len(compressed),
f"{_compressed_est:,}",
)

View File

@ -600,18 +600,32 @@ def run_conversation(
system_prompt=active_system_prompt or "",
tools=agent.tools or None,
)
_compressor = agent.context_compressor
_defer_preflight = getattr(
_compressor,
"should_defer_preflight_to_real_usage",
lambda _tokens: False,
)
if agent.context_compressor.should_compress(_preflight_tokens):
if _defer_preflight(_preflight_tokens):
logger.info(
"Skipping preflight compression: rough estimate ~%s >= %s, "
"but last real provider prompt was %s after compression",
f"{_preflight_tokens:,}",
f"{_compressor.threshold_tokens:,}",
f"{_compressor.last_real_prompt_tokens:,}",
)
elif _compressor.should_compress(_preflight_tokens):
logger.info(
"Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)",
f"{_preflight_tokens:,}",
f"{agent.context_compressor.threshold_tokens:,}",
f"{_compressor.threshold_tokens:,}",
agent.model,
f"{agent.context_compressor.context_length:,}",
f"{_compressor.context_length:,}",
)
agent._emit_status(
f"📦 Preflight compression: ~{_preflight_tokens:,} tokens "
f">= {agent.context_compressor.threshold_tokens:,} threshold. "
f">= {_compressor.threshold_tokens:,} threshold. "
"This may take a moment."
)
# May need multiple passes for very large sessions with small
@ -646,8 +660,8 @@ def run_conversation(
system_prompt=active_system_prompt or "",
tools=agent.tools or None,
)
if _preflight_tokens < agent.context_compressor.threshold_tokens:
break # Under threshold
if not _compressor.should_compress(_preflight_tokens):
break # Under threshold or anti-thrash guard stopped it
# Plugin hook: pre_llm_call
# Fired once per turn before the tool-calling loop. Plugins can
@ -3862,6 +3876,11 @@ def run_conversation(
# inflate completion_tokens with reasoning,
# causing premature compression. (#12026)
_real_tokens = _compressor.last_prompt_tokens
elif _compressor.last_prompt_tokens == -1:
# Compression just ran and no API-reported prompt count
# has arrived yet. Avoid treating a schema-heavy rough
# post-compression estimate as real context pressure.
_real_tokens = 0
else:
# Include tool schemas — with 50+ tools enabled
# these add 20-30K tokens the messages-only

View File

@ -41,6 +41,8 @@ class TestShouldCompress:
class TestUpdateFromResponse:
def test_updates_fields(self, compressor):
compressor.awaiting_real_usage_after_compression = True
compressor.last_compression_rough_tokens = 90_000
compressor.update_from_response({
"prompt_tokens": 5000,
"completion_tokens": 1000,
@ -48,12 +50,39 @@ class TestUpdateFromResponse:
})
assert compressor.last_prompt_tokens == 5000
assert compressor.last_completion_tokens == 1000
assert compressor.last_real_prompt_tokens == 5000
assert compressor.last_rough_tokens_when_real_prompt_fit == 90_000
assert compressor.awaiting_real_usage_after_compression is False
def test_missing_fields_default_zero(self, compressor):
compressor.update_from_response({})
assert compressor.last_prompt_tokens == 0
class TestPreflightDeferral:
    """Unit tests for ``should_defer_preflight_to_real_usage`` on the compressor."""

    def test_defers_when_recent_real_usage_fit_and_rough_growth_is_small(self, compressor):
        compressor.threshold_tokens = 85_000
        compressor.last_real_prompt_tokens = 50_000
        compressor.last_rough_tokens_when_real_prompt_fit = 90_000

        # Rough estimate grew by 3,000 over the stored baseline.
        assert compressor.should_defer_preflight_to_real_usage(93_000) is True
        # Deferral raises the stored baseline to the new rough estimate.
        assert compressor.last_rough_tokens_when_real_prompt_fit == 93_000

    def test_does_not_defer_when_rough_growth_is_large(self, compressor):
        compressor.threshold_tokens = 85_000
        compressor.last_real_prompt_tokens = 50_000
        compressor.last_rough_tokens_when_real_prompt_fit = 90_000

        # Rough estimate grew by 10,000 over the stored baseline.
        assert compressor.should_defer_preflight_to_real_usage(100_000) is False

    def test_does_not_defer_without_recent_real_usage(self, compressor):
        compressor.threshold_tokens = 85_000
        compressor.last_real_prompt_tokens = 0
        compressor.last_rough_tokens_when_real_prompt_fit = 90_000

        # No provider-reported prompt count is recorded, so never defer.
        assert compressor.should_defer_preflight_to_real_usage(93_000) is False
class TestCompress:
def _make_messages(self, n):

View File

@ -491,6 +491,83 @@ class TestPreflightCompression:
for ev, msg in status_messages
)
def test_preflight_defers_when_recent_real_usage_fit(self, agent):
    """A noisy rough estimate should not re-compact a recently fitting request."""
    agent.compression_enabled = True
    agent.context_compressor.context_length = 200_000
    agent.context_compressor.threshold_tokens = 100_000
    # Last provider-reported prompt fit well under the threshold, while the
    # rough estimate recorded for it was already above the threshold.
    agent.context_compressor.last_prompt_tokens = 58_000
    agent.context_compressor.last_real_prompt_tokens = 58_000
    agent.context_compressor.last_rough_tokens_when_real_prompt_fit = 113_000

    big_history = []
    for i in range(20):
        big_history.append({"role": "user", "content": f"Message {i} padded"})
        big_history.append({"role": "assistant", "content": f"Response {i} padded"})

    ok_resp = _mock_response(
        content="Used real fit",
        finish_reason="stop",
        usage={"prompt_tokens": 59_000, "completion_tokens": 100, "total_tokens": 59_100},
    )
    agent.client.chat.completions.create.side_effect = [ok_resp]

    status_messages = []
    agent.status_callback = lambda ev, msg: status_messages.append((ev, msg))

    # The patched rough estimate (114,000) is above the threshold but only
    # 1,000 above the recorded baseline.
    with (
        patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=114_000),
        patch.object(agent, "_compress_context") as mock_compress,
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        result = agent.run_conversation("hello", conversation_history=big_history)

    mock_compress.assert_not_called()
    assert result["completed"] is True
    assert result["final_response"] == "Used real fit"
    # No preflight-compression status should have been emitted.
    assert not any(
        ev == "lifecycle" and "Preflight compression" in msg
        for ev, msg in status_messages
    )
def test_preflight_compresses_when_rough_growth_after_fit_is_large(self, agent):
    """Large rough growth after a fitting request still triggers preflight."""
    agent.compression_enabled = True
    agent.context_compressor.context_length = 200_000
    agent.context_compressor.threshold_tokens = 100_000
    agent.context_compressor.last_prompt_tokens = 58_000
    agent.context_compressor.last_real_prompt_tokens = 58_000
    agent.context_compressor.last_rough_tokens_when_real_prompt_fit = 113_000

    big_history = []
    for i in range(20):
        big_history.append({"role": "user", "content": f"Message {i} padded"})
        big_history.append({"role": "assistant", "content": f"Response {i} padded"})

    ok_resp = _mock_response(
        content="Compressed after growth",
        finish_reason="stop",
        usage={"prompt_tokens": 50_000, "completion_tokens": 100, "total_tokens": 50_100},
    )
    agent.client.chat.completions.create.side_effect = [ok_resp]

    # First rough estimate (125,000) is 12,000 above the recorded baseline;
    # the second (40,000) is the value returned on the next estimate call.
    with (
        patch("agent.conversation_loop.estimate_request_tokens_rough", side_effect=[125_000, 40_000]),
        patch.object(agent, "_compress_context") as mock_compress,
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        mock_compress.return_value = (
            [{"role": "user", "content": f"{SUMMARY_PREFIX}\nPrevious conversation"}],
            "new system prompt",
        )
        result = agent.run_conversation("hello", conversation_history=big_history)

    mock_compress.assert_called_once()
    assert result["completed"] is True
def test_no_preflight_when_under_threshold(self, agent):
"""When history fits within context, no preflight compression needed."""
agent.compression_enabled = True