fix(compression): avoid repeat preflight compaction from rough estimates
This commit is contained in:
@ -518,6 +518,10 @@ class ContextCompressor(ContextEngine):
|
||||
self._last_compression_savings_pct = 100.0
|
||||
self._ineffective_compression_count = 0
|
||||
self._summary_failure_cooldown_until = 0.0 # transient errors must not block a fresh session
|
||||
self.last_real_prompt_tokens = 0
|
||||
self.last_compression_rough_tokens = 0
|
||||
self.last_rough_tokens_when_real_prompt_fit = 0
|
||||
self.awaiting_real_usage_after_compression = False
|
||||
|
||||
def update_model(
|
||||
self,
|
||||
@ -615,6 +619,10 @@ class ContextCompressor(ContextEngine):
|
||||
|
||||
self.last_prompt_tokens = 0
|
||||
self.last_completion_tokens = 0
|
||||
self.last_real_prompt_tokens = 0
|
||||
self.last_compression_rough_tokens = 0
|
||||
self.last_rough_tokens_when_real_prompt_fit = 0
|
||||
self.awaiting_real_usage_after_compression = False
|
||||
|
||||
self.summary_model = summary_model_override or ""
|
||||
|
||||
@ -648,6 +656,44 @@ class ContextCompressor(ContextEngine):
|
||||
self.last_prompt_tokens = usage.get("prompt_tokens", 0)
|
||||
self.last_completion_tokens = usage.get("completion_tokens", 0)
|
||||
self.last_total_tokens = usage.get("total_tokens", self.last_prompt_tokens + self.last_completion_tokens)
|
||||
if self.last_prompt_tokens > 0:
|
||||
self.last_real_prompt_tokens = self.last_prompt_tokens
|
||||
if self.last_prompt_tokens < self.threshold_tokens:
|
||||
if self.awaiting_real_usage_after_compression and self.last_compression_rough_tokens > 0:
|
||||
self.last_rough_tokens_when_real_prompt_fit = self.last_compression_rough_tokens
|
||||
else:
|
||||
self.last_rough_tokens_when_real_prompt_fit = 0
|
||||
self.awaiting_real_usage_after_compression = False
|
||||
|
||||
def should_defer_preflight_to_real_usage(self, rough_tokens: int) -> bool:
    """Decide whether preflight compression should be skipped this turn.

    The rough request estimate (``estimate_request_tokens_rough(...,
    tools=...)``) deliberately overshoots for schema-heavy requests so
    Hermes compresses before a provider rejects the payload. Once the
    provider has reported a real ``prompt_tokens`` value that fit under
    the threshold, that number is the better signal, so a still-high rough
    estimate should not trigger another compaction on its own.

    Deferral happens only when all of the following hold:

    * ``rough_tokens`` is at or above ``threshold_tokens`` (otherwise
      there is nothing to defer);
    * the last provider-reported prompt size is positive and strictly
      below the threshold;
    * a positive rough baseline exists (the rough size recorded when a
      real prompt last fit, falling back to the rough size recorded at
      the last compression);
    * the rough estimate has grown past that baseline by no more than
      ``max(4096, 5% of threshold_tokens)`` tokens.

    Side effect: when deferring, the stored baseline
    ``last_rough_tokens_when_real_prompt_fit`` is raised to
    ``max(baseline, rough_tokens)``.

    Args:
        rough_tokens: Rough token estimate for the upcoming request.

    Returns:
        True when preflight compression should be skipped, else False.
    """
    limit = self.threshold_tokens

    # Rough estimate is not over the threshold: no compression to defer.
    if rough_tokens < limit:
        return False

    # Need a provider-reported prompt size that actually fit.
    if not 0 < self.last_real_prompt_tokens < limit:
        return False

    # Prefer the rough size seen when a real prompt fit; otherwise fall
    # back to the rough size captured right after the last compression.
    reference = self.last_rough_tokens_when_real_prompt_fit
    if not reference:
        reference = self.last_compression_rough_tokens
    if reference <= 0:
        return False

    # The allowance is always positive, so a shrinking estimate
    # (negative delta) can never exceed it.
    allowance = max(4096, int(limit * 0.05))
    if rough_tokens - reference > allowance:
        return False

    # Ratchet the baseline so later checks measure growth from here.
    self.last_rough_tokens_when_real_prompt_fit = max(reference, rough_tokens)
    return True
|
||||
|
||||
def should_compress(self, prompt_tokens: int = None) -> bool:
|
||||
"""Check if context exceeds the compression threshold.
|
||||
|
||||
@ -115,6 +115,15 @@ class ContextEngine(ABC):
|
||||
"""
|
||||
return False
|
||||
|
||||
def should_defer_preflight_to_real_usage(self, rough_tokens: int) -> bool:
    """Report whether preflight should defer to recent provider usage.

    Default implementation: never defer. The built-in compressor
    overrides this hook so it does not re-compact from a known-noisy
    rough estimate after a compressed request has already fit.
    Third-party engines may leave it untouched.

    Args:
        rough_tokens: Rough token estimate for the upcoming request
            (unused by this default).

    Returns:
        Always False.
    """
    return False
|
||||
|
||||
# -- Optional: manual /compress preflight ------------------------------
|
||||
|
||||
def has_content_to_compress(self, messages: List[Dict[str, Any]]) -> bool:
|
||||
|
||||
@ -575,19 +575,18 @@ def compress_context(
|
||||
force=True,
|
||||
)
|
||||
|
||||
# Update token estimate after compaction so pressure calculations
|
||||
# use the post-compression count, not the stale pre-compression one.
|
||||
# Use estimate_request_tokens_rough() so tool schemas are included —
|
||||
# with 50+ tools enabled, schemas alone can add 20-30K tokens, and
|
||||
# omitting them delays the next compression cycle far past the
|
||||
# configured threshold (issue #14695).
|
||||
# Keep the post-compression rough estimate for diagnostics, but do not
|
||||
# treat it as provider-reported prompt usage. Schema-heavy rough estimates
|
||||
# can remain above threshold even after the next real API request fits.
|
||||
_compressed_est = estimate_request_tokens_rough(
|
||||
compressed,
|
||||
system_prompt=new_system_prompt or "",
|
||||
tools=agent.tools or None,
|
||||
)
|
||||
agent.context_compressor.last_prompt_tokens = _compressed_est
|
||||
agent.context_compressor.last_compression_rough_tokens = _compressed_est
|
||||
agent.context_compressor.last_prompt_tokens = -1
|
||||
agent.context_compressor.last_completion_tokens = 0
|
||||
agent.context_compressor.awaiting_real_usage_after_compression = True
|
||||
|
||||
# Clear the file-read dedup cache. After compression the original
|
||||
# read content is summarised away — if the model re-reads the same
|
||||
@ -599,7 +598,7 @@ def compress_context(
|
||||
pass
|
||||
|
||||
logger.info(
|
||||
"context compression done: session=%s messages=%d->%d tokens=~%s",
|
||||
"context compression done: session=%s messages=%d->%d rough_tokens=~%s awaiting_real_usage=true",
|
||||
agent.session_id or "none", _pre_msg_count, len(compressed),
|
||||
f"{_compressed_est:,}",
|
||||
)
|
||||
|
||||
@ -600,18 +600,32 @@ def run_conversation(
|
||||
system_prompt=active_system_prompt or "",
|
||||
tools=agent.tools or None,
|
||||
)
|
||||
_compressor = agent.context_compressor
|
||||
_defer_preflight = getattr(
|
||||
_compressor,
|
||||
"should_defer_preflight_to_real_usage",
|
||||
lambda _tokens: False,
|
||||
)
|
||||
|
||||
if agent.context_compressor.should_compress(_preflight_tokens):
|
||||
if _defer_preflight(_preflight_tokens):
|
||||
logger.info(
|
||||
"Skipping preflight compression: rough estimate ~%s >= %s, "
|
||||
"but last real provider prompt was %s after compression",
|
||||
f"{_preflight_tokens:,}",
|
||||
f"{_compressor.threshold_tokens:,}",
|
||||
f"{_compressor.last_real_prompt_tokens:,}",
|
||||
)
|
||||
elif _compressor.should_compress(_preflight_tokens):
|
||||
logger.info(
|
||||
"Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)",
|
||||
f"{_preflight_tokens:,}",
|
||||
f"{agent.context_compressor.threshold_tokens:,}",
|
||||
f"{_compressor.threshold_tokens:,}",
|
||||
agent.model,
|
||||
f"{agent.context_compressor.context_length:,}",
|
||||
f"{_compressor.context_length:,}",
|
||||
)
|
||||
agent._emit_status(
|
||||
f"📦 Preflight compression: ~{_preflight_tokens:,} tokens "
|
||||
f">= {agent.context_compressor.threshold_tokens:,} threshold. "
|
||||
f">= {_compressor.threshold_tokens:,} threshold. "
|
||||
"This may take a moment."
|
||||
)
|
||||
# May need multiple passes for very large sessions with small
|
||||
@ -646,8 +660,8 @@ def run_conversation(
|
||||
system_prompt=active_system_prompt or "",
|
||||
tools=agent.tools or None,
|
||||
)
|
||||
if _preflight_tokens < agent.context_compressor.threshold_tokens:
|
||||
break # Under threshold
|
||||
if not _compressor.should_compress(_preflight_tokens):
|
||||
break # Under threshold or anti-thrash guard stopped it
|
||||
|
||||
# Plugin hook: pre_llm_call
|
||||
# Fired once per turn before the tool-calling loop. Plugins can
|
||||
@ -3862,6 +3876,11 @@ def run_conversation(
|
||||
# inflate completion_tokens with reasoning,
|
||||
# causing premature compression. (#12026)
|
||||
_real_tokens = _compressor.last_prompt_tokens
|
||||
elif _compressor.last_prompt_tokens == -1:
|
||||
# Compression just ran and no API-reported prompt count
|
||||
# has arrived yet. Avoid treating a schema-heavy rough
|
||||
# post-compression estimate as real context pressure.
|
||||
_real_tokens = 0
|
||||
else:
|
||||
# Include tool schemas — with 50+ tools enabled
|
||||
# these add 20-30K tokens the messages-only
|
||||
|
||||
@ -41,6 +41,8 @@ class TestShouldCompress:
|
||||
|
||||
class TestUpdateFromResponse:
|
||||
def test_updates_fields(self, compressor):
|
||||
compressor.awaiting_real_usage_after_compression = True
|
||||
compressor.last_compression_rough_tokens = 90_000
|
||||
compressor.update_from_response({
|
||||
"prompt_tokens": 5000,
|
||||
"completion_tokens": 1000,
|
||||
@ -48,12 +50,39 @@ class TestUpdateFromResponse:
|
||||
})
|
||||
assert compressor.last_prompt_tokens == 5000
|
||||
assert compressor.last_completion_tokens == 1000
|
||||
assert compressor.last_real_prompt_tokens == 5000
|
||||
assert compressor.last_rough_tokens_when_real_prompt_fit == 90_000
|
||||
assert compressor.awaiting_real_usage_after_compression is False
|
||||
|
||||
def test_missing_fields_default_zero(self, compressor):
|
||||
compressor.update_from_response({})
|
||||
assert compressor.last_prompt_tokens == 0
|
||||
|
||||
|
||||
class TestPreflightDeferral:
    """Checks for ``should_defer_preflight_to_real_usage`` on the compressor."""

    @staticmethod
    def _prime(compressor, real_prompt_tokens):
        """Set an 85k threshold, a 90k rough baseline, and the given real usage."""
        compressor.threshold_tokens = 85_000
        compressor.last_real_prompt_tokens = real_prompt_tokens
        compressor.last_rough_tokens_when_real_prompt_fit = 90_000

    def test_defers_when_recent_real_usage_fit_and_rough_growth_is_small(self, compressor):
        # 3k growth over the baseline is inside the tolerated window.
        self._prime(compressor, 50_000)

        deferred = compressor.should_defer_preflight_to_real_usage(93_000)

        assert deferred is True
        # A deferral moves the stored baseline up to the new estimate.
        assert compressor.last_rough_tokens_when_real_prompt_fit == 93_000

    def test_does_not_defer_when_rough_growth_is_large(self, compressor):
        # 10k growth over the baseline exceeds the tolerated window.
        self._prime(compressor, 50_000)

        deferred = compressor.should_defer_preflight_to_real_usage(100_000)

        assert deferred is False

    def test_does_not_defer_without_recent_real_usage(self, compressor):
        # With no provider-reported prompt size there is nothing to trust.
        self._prime(compressor, 0)

        deferred = compressor.should_defer_preflight_to_real_usage(93_000)

        assert deferred is False
|
||||
|
||||
|
||||
|
||||
class TestCompress:
|
||||
def _make_messages(self, n):
|
||||
|
||||
@ -491,6 +491,83 @@ class TestPreflightCompression:
|
||||
for ev, msg in status_messages
|
||||
)
|
||||
|
||||
def test_preflight_defers_when_recent_real_usage_fit(self, agent):
    """A noisy rough estimate should not re-compact a recently fitting request."""
    agent.compression_enabled = True
    compressor = agent.context_compressor
    compressor.context_length = 200_000
    compressor.threshold_tokens = 100_000
    # Provider recently reported 58k while the rough estimate sat at 113k.
    compressor.last_prompt_tokens = 58_000
    compressor.last_real_prompt_tokens = 58_000
    compressor.last_rough_tokens_when_real_prompt_fit = 113_000

    big_history = []
    for i in range(20):
        big_history.extend((
            {"role": "user", "content": f"Message {i} padded"},
            {"role": "assistant", "content": f"Response {i} padded"},
        ))

    agent.client.chat.completions.create.side_effect = [
        _mock_response(
            content="Used real fit",
            finish_reason="stop",
            usage={"prompt_tokens": 59_000, "completion_tokens": 100, "total_tokens": 59_100},
        )
    ]

    status_messages = []

    def _record(ev, msg):
        status_messages.append((ev, msg))

    agent.status_callback = _record

    # The rough estimate is only 1k above the recorded baseline.
    with (
        patch("agent.conversation_loop.estimate_request_tokens_rough", return_value=114_000),
        patch.object(agent, "_compress_context") as mock_compress,
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        result = agent.run_conversation("hello", conversation_history=big_history)

    mock_compress.assert_not_called()
    assert result["completed"] is True
    assert result["final_response"] == "Used real fit"
    # No lifecycle status may announce a preflight compression.
    preflight_notices = [
        msg
        for ev, msg in status_messages
        if ev == "lifecycle" and "Preflight compression" in msg
    ]
    assert not preflight_notices
|
||||
|
||||
def test_preflight_compresses_when_rough_growth_after_fit_is_large(self, agent):
    """Large rough growth after a fitting request still triggers preflight."""
    agent.compression_enabled = True
    compressor = agent.context_compressor
    compressor.context_length = 200_000
    compressor.threshold_tokens = 100_000
    # Same starting state as the deferral case: real 58k, rough baseline 113k.
    compressor.last_prompt_tokens = 58_000
    compressor.last_real_prompt_tokens = 58_000
    compressor.last_rough_tokens_when_real_prompt_fit = 113_000

    big_history = []
    for i in range(20):
        big_history.extend((
            {"role": "user", "content": f"Message {i} padded"},
            {"role": "assistant", "content": f"Response {i} padded"},
        ))

    agent.client.chat.completions.create.side_effect = [
        _mock_response(
            content="Compressed after growth",
            finish_reason="stop",
            usage={"prompt_tokens": 50_000, "completion_tokens": 100, "total_tokens": 50_100},
        )
    ]

    # First estimate is 12k above the baseline; the second (40k) is what the
    # patched estimator returns on its next call.
    with (
        patch("agent.conversation_loop.estimate_request_tokens_rough", side_effect=[125_000, 40_000]),
        patch.object(agent, "_compress_context") as mock_compress,
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        compacted_messages = [
            {"role": "user", "content": f"{SUMMARY_PREFIX}\nPrevious conversation"}
        ]
        mock_compress.return_value = (compacted_messages, "new system prompt")
        result = agent.run_conversation("hello", conversation_history=big_history)

    mock_compress.assert_called_once()
    assert result["completed"] is True
|
||||
|
||||
def test_no_preflight_when_under_threshold(self, agent):
|
||||
"""When history fits within context, no preflight compression needed."""
|
||||
agent.compression_enabled = True
|
||||
|
||||
Reference in New Issue
Block a user