Files
hermes-agent/tests/gateway/test_stream_consumer_fresh_final.py
firefly 8bf498c21d fix(gateway): scope final-delivery flags to turn-final segment (#29346)
A streamed preamble ("Let me search...") finalized at a tool boundary
routed through _try_fresh_final, which unconditionally set
_final_response_sent=True even though it is a NON-final segment. The
gateway then reads that flag as "final delivered" and suppresses the
genuine final answer produced on the next API call, so the user silently
gets nothing. Only reproduces with fresh_final_after_seconds > 0.

- _try_fresh_final / _send_or_edit take is_turn_final; the segment-break
  call site passes is_turn_final=got_done so only the turn-final answer
  marks final-delivered.
- _reset_segment_state clears the final-delivery flags at every tool
  boundary as defense-in-depth against any future premature setter.
- Failing-first regression + happy-path no-duplicate test.
2026-06-01 17:31:32 -07:00

411 lines
18 KiB
Python

"""Regression tests for the fresh-final-for-long-lived-previews path.
Ported from openclaw/openclaw#72038. When a streamed preview has been
visible long enough that the platform's edit timestamp would be
noticeably stale by completion time, the stream consumer delivers the
final reply as a brand-new message and best-effort deletes the old
preview. This makes Telegram's visible timestamp reflect completion
time instead of first-token time.
"""
from __future__ import annotations
import asyncio
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock
import pytest
from gateway.stream_consumer import GatewayStreamConsumer, StreamConsumerConfig
def _make_adapter(*, supports_delete: bool = True) -> MagicMock:
"""Build a minimal MagicMock adapter wired for send/edit/delete."""
adapter = MagicMock()
adapter.REQUIRES_EDIT_FINALIZE = False
adapter.MAX_MESSAGE_LENGTH = 4096
adapter.send = AsyncMock(return_value=SimpleNamespace(
success=True, message_id="initial_preview",
))
adapter.edit_message = AsyncMock(return_value=SimpleNamespace(
success=True, message_id="initial_preview",
))
if supports_delete:
adapter.delete_message = AsyncMock(return_value=True)
else:
# Adapter without the optional delete_message method — fresh-final
# should still work, it just leaves the stale preview in place.
del adapter.delete_message # type: ignore[attr-defined]
return adapter
class TestFreshFinalForLongLivedPreviews:
    """Fresh-final delivery for long-lived previews (openclaw#72038 port).

    Every test sends an initial preview through ``_send_or_edit`` and then
    checks whether the follow-up call edits that preview in place or sends a
    brand-new message, depending on ``fresh_final_after_seconds`` and on how
    old the preview is made to look.
    """

    @staticmethod
    def _consumer_with_threshold(
        adapter: MagicMock, threshold: float
    ) -> GatewayStreamConsumer:
        """Return a consumer for chat ``"chat"`` with the given threshold."""
        settings = StreamConsumerConfig(fresh_final_after_seconds=threshold)
        return GatewayStreamConsumer(
            adapter=adapter, chat_id="chat", config=settings,
        )

    @staticmethod
    def _queue_send_results(adapter: MagicMock, followup: SimpleNamespace) -> None:
        """Script ``adapter.send``: the initial preview first, then *followup*."""
        adapter.send.side_effect = [
            SimpleNamespace(success=True, message_id="initial_preview"),
            followup,
        ]

    @pytest.mark.asyncio
    async def test_disabled_by_default_still_edits_in_place(self):
        """``fresh_final_after_seconds=0`` keeps the legacy edit path."""
        mock_adapter = _make_adapter()
        stream = self._consumer_with_threshold(mock_adapter, 0.0)

        await stream._send_or_edit("hello")
        # Backdate the preview so it reads as created far in the past.
        stream._message_created_ts = 0.0
        await stream._send_or_edit("hello world", finalize=True)

        # The final text must arrive as an edit; the only send is the preview.
        assert mock_adapter.send.call_count == 1
        mock_adapter.edit_message.assert_called_once()

    @pytest.mark.asyncio
    async def test_short_lived_preview_edits_in_place(self):
        """A preview younger than the threshold is finalized by editing."""
        mock_adapter = _make_adapter()
        stream = self._consumer_with_threshold(mock_adapter, 60.0)

        await stream._send_or_edit("hello")
        # The creation timestamp is left untouched, so the preview is "new".
        await stream._send_or_edit("hello world", finalize=True)

        assert mock_adapter.send.call_count == 1
        mock_adapter.edit_message.assert_called_once()

    @pytest.mark.asyncio
    async def test_long_lived_preview_sends_fresh_final(self):
        """A preview older than the threshold is finalized by a fresh send."""
        mock_adapter = _make_adapter()
        self._queue_send_results(
            mock_adapter,
            SimpleNamespace(success=True, message_id="fresh_final"),
        )
        stream = self._consumer_with_threshold(mock_adapter, 60.0)

        await stream._send_or_edit("hello")
        # Make the preview look stale (visible for more than 60 seconds).
        stream._message_created_ts = 0.0
        await stream._send_or_edit("hello world", finalize=True)

        # A second send happened and the old preview was never edited.
        assert mock_adapter.send.call_count == 2
        mock_adapter.edit_message.assert_not_called()
        # The stale preview is cleaned up.
        mock_adapter.delete_message.assert_awaited_once_with(
            "chat", "initial_preview",
        )
        # The consumer now tracks the new message and reports final sent.
        assert stream._message_id == "fresh_final"
        assert stream._final_response_sent is True

    @pytest.mark.asyncio
    async def test_fresh_final_without_delete_support_is_best_effort(self):
        """An adapter lacking ``delete_message`` still gets the fresh send."""
        mock_adapter = _make_adapter(supports_delete=False)
        self._queue_send_results(
            mock_adapter,
            SimpleNamespace(success=True, message_id="fresh_final"),
        )
        stream = self._consumer_with_threshold(mock_adapter, 60.0)

        await stream._send_or_edit("hello")
        stream._message_created_ts = 0.0
        await stream._send_or_edit("hello world", finalize=True)

        assert mock_adapter.send.call_count == 2
        mock_adapter.edit_message.assert_not_called()
        # No delete is possible on this adapter; only the fresh send matters.
        assert stream._message_id == "fresh_final"

    @pytest.mark.asyncio
    async def test_fresh_final_fallback_to_edit_on_send_failure(self):
        """A failed fresh send falls back to the normal edit path."""
        mock_adapter = _make_adapter()
        self._queue_send_results(
            mock_adapter,
            SimpleNamespace(success=False, error="network"),
        )
        stream = self._consumer_with_threshold(mock_adapter, 60.0)

        await stream._send_or_edit("hello")
        stream._message_created_ts = 0.0
        outcome = await stream._send_or_edit("hello world", finalize=True)

        # The fresh send was tried and failed, so an edit was done instead.
        assert mock_adapter.send.call_count == 2
        mock_adapter.edit_message.assert_called_once()
        assert outcome is True

    @pytest.mark.asyncio
    async def test_only_finalize_triggers_fresh_final(self):
        """Intermediate edits (``finalize=False``) never switch to a fresh send."""
        mock_adapter = _make_adapter()
        stream = self._consumer_with_threshold(mock_adapter, 60.0)

        await stream._send_or_edit("hello")
        stream._message_created_ts = 0.0  # preview looks stale
        await stream._send_or_edit("hello partial")  # not a finalize call

        assert mock_adapter.send.call_count == 1
        mock_adapter.edit_message.assert_called_once()

    @pytest.mark.asyncio
    async def test_no_edit_sentinel_is_not_affected(self):
        """Platforms on the ``__no_edit__`` sentinel never go fresh-final."""
        mock_adapter = _make_adapter()
        # A successful send that yields no message id.
        mock_adapter.send.return_value = SimpleNamespace(
            success=True, message_id=None,
        )
        stream = self._consumer_with_threshold(mock_adapter, 60.0)

        await stream._send_or_edit("hello")

        assert stream._message_id == "__no_edit__"
        assert stream._message_created_ts is None
        # The sentinel gates the fresh-final decision.
        assert stream._should_send_fresh_final() is False
class TestSegmentBreakDoesNotMarkFinalSent:
    """Regression for #29346: silent response loss after tool calls.

    With ``fresh_final_after_seconds > 0``, a streamed *preamble* ("Let me
    search…") that has aged past the threshold used to be finalized at a tool
    boundary through ``_try_fresh_final``, which unconditionally set
    ``_final_response_sent = True`` although the segment was NOT the final
    one. The gateway (cited in the original report as run.py:18128) then
    treated the turn as delivered and suppressed the genuine final answer,
    which arrives on a later API call and is not re-streamed, so the user got
    nothing.

    The fix scopes the final-delivery flags to the turn-final segment and
    clears them at every tool/segment boundary, so a preamble can never mark
    the turn as delivered.
    """

    # Pause between steps so the running consumer task can act on queued
    # deltas and the preview can age past the tiny thresholds used here.
    _SETTLE_SECONDS = 0.05

    @staticmethod
    def _streaming_consumer(
        adapter: MagicMock, fresh_final_after_seconds: float
    ) -> GatewayStreamConsumer:
        """Return a fast-flushing consumer for chat ``"chat"``."""
        settings = StreamConsumerConfig(
            edit_interval=0.01,
            buffer_threshold=5,
            cursor="",
            fresh_final_after_seconds=fresh_final_after_seconds,
        )
        return GatewayStreamConsumer(
            adapter=adapter, chat_id="chat", config=settings,
        )

    @staticmethod
    def _delivered_texts(adapter) -> list[str]:
        """Return the ``content`` kwarg of every send, then of every edit."""
        recorded_calls = (
            adapter.send.call_args_list + adapter.edit_message.call_args_list
        )
        return [entry.kwargs.get("content", "") for entry in recorded_calls]

    @staticmethod
    def _sends_containing(adapter, needle: str) -> int:
        """Count ``adapter.send`` calls whose ``content`` kwarg holds *needle*."""
        return sum(
            1
            for entry in adapter.send.call_args_list
            if needle in entry.kwargs.get("content", "")
        )

    @pytest.mark.asyncio
    async def test_preamble_fresh_final_at_tool_boundary_does_not_mark_final(self):
        """A fresh-final'd preamble must not mark the turn as delivered.

        Uses real aging (a tiny threshold plus sleeps) rather than patching
        the age gate. The genuine final answer is never streamed through this
        consumer, which is the #29346 reproduction.
        """
        mock_adapter = _make_adapter()
        stream = self._streaming_consumer(mock_adapter, 0.001)

        stream.on_delta("Let me search the web for that.")
        runner = asyncio.create_task(stream.run())
        # Let the preamble go out and age well past 0.001s.
        await asyncio.sleep(self._SETTLE_SECONDS)
        stream.on_delta(None)  # tool boundary → segment-break fresh-final
        await asyncio.sleep(self._SETTLE_SECONDS)
        stream.finish()
        await runner

        # Fresh-final engaged (preview plus a fresh resend), yet nothing
        # final was streamed, so neither delivery flag may be set.
        assert mock_adapter.send.call_count >= 2
        assert stream.final_response_sent is False
        assert stream.final_content_delivered is False

    @pytest.mark.asyncio
    async def test_final_answer_after_preamble_is_delivered_exactly_once(self):
        """The streamed final answer is delivered and marked as sent.

        When the real answer streams in after preamble + tool boundary it
        must reach the user, be sent at most once, and set
        ``final_response_sent`` so the gateway suppresses a redundant send.
        """
        mock_adapter = _make_adapter()
        stream = self._streaming_consumer(mock_adapter, 0.001)

        stream.on_delta("Let me search the web for that.")
        runner = asyncio.create_task(stream.run())
        await asyncio.sleep(self._SETTLE_SECONDS)
        stream.on_delta(None)  # tool boundary
        stream.on_delta("The answer is 42.")  # genuine final answer streams
        await asyncio.sleep(self._SETTLE_SECONDS)
        stream.finish()
        await runner

        # The real final was delivered, so suppression must engage.
        assert stream.final_response_sent is True
        # No duplicate fresh send of the final answer.
        assert self._sends_containing(mock_adapter, "answer is 42") <= 1
        shown = self._delivered_texts(mock_adapter)
        assert any("answer is 42" in text for text in shown)

    @pytest.mark.asyncio
    async def test_genuine_final_answer_without_tools_marks_delivered(self):
        """A single answer with no tool boundary still marks the final sent."""
        mock_adapter = _make_adapter()
        stream = self._streaming_consumer(mock_adapter, 60.0)

        stream.on_delta("Here is the full answer.")
        runner = asyncio.create_task(stream.run())
        await asyncio.sleep(self._SETTLE_SECONDS)
        stream.finish()
        await runner

        assert stream.final_response_sent is True
        shown = self._delivered_texts(mock_adapter)
        assert any("Here is the full answer." in text for text in shown)

    @pytest.mark.asyncio
    async def test_no_edit_adapter_delivers_final_after_preamble(self):
        """No-edit adapters must still deliver the final after a preamble.

        The adapter's send returns no message id (the ``__no_edit__`` case,
        e.g. Signal/SMS/webhook). A preamble before a tool call must not
        swallow the genuine final answer.
        """
        mock_adapter = _make_adapter()
        mock_adapter.send.return_value = SimpleNamespace(
            success=True, message_id=None,
        )
        stream = self._streaming_consumer(mock_adapter, 0.001)

        stream.on_delta("Let me search the web for that.")
        runner = asyncio.create_task(stream.run())
        await asyncio.sleep(self._SETTLE_SECONDS)
        stream.on_delta(None)  # tool boundary
        stream.on_delta("The answer is 42.")  # genuine final answer
        await asyncio.sleep(self._SETTLE_SECONDS)
        stream.finish()
        await runner

        # The final answer went out through send.
        assert any(
            "answer is 42" in entry.kwargs.get("content", "")
            for entry in mock_adapter.send.call_args_list
        )

    @pytest.mark.asyncio
    async def test_multi_tool_call_turn_delivers_final_once(self):
        """Two tool boundaries before the final: still delivered and marked."""
        mock_adapter = _make_adapter()
        stream = self._streaming_consumer(mock_adapter, 0.001)

        stream.on_delta("Let me check a couple of things.")
        runner = asyncio.create_task(stream.run())
        await asyncio.sleep(self._SETTLE_SECONDS)
        stream.on_delta(None)  # tool boundary 1
        stream.on_delta("Now cross-referencing.")
        await asyncio.sleep(self._SETTLE_SECONDS)
        stream.on_delta(None)  # tool boundary 2
        stream.on_delta("The answer is 42.")  # genuine final answer
        await asyncio.sleep(self._SETTLE_SECONDS)
        stream.finish()
        await runner

        assert stream.final_response_sent is True
        assert self._sends_containing(mock_adapter, "answer is 42") <= 1
        shown = self._delivered_texts(mock_adapter)
        assert any("answer is 42" in text for text in shown)
class TestStreamConsumerConfigFreshFinalField:
    """``StreamConsumerConfig.fresh_final_after_seconds``: present, default 0."""

    def test_default_is_disabled(self):
        """A default-constructed config has the feature off (0.0)."""
        assert StreamConsumerConfig().fresh_final_after_seconds == 0.0

    def test_field_is_configurable(self):
        """An explicit value passed to the constructor is stored as given."""
        configured = StreamConsumerConfig(fresh_final_after_seconds=120.0)
        assert configured.fresh_final_after_seconds == 120.0
class TestStreamingConfigFreshFinalField:
    """The gateway-level ``StreamingConfig`` carries the fresh-final setting."""

    @staticmethod
    def _config_cls():
        """Import ``StreamingConfig`` at test time and return the class."""
        from gateway.config import StreamingConfig

        return StreamingConfig

    def test_default_enables_with_60s(self):
        """A default-constructed ``StreamingConfig`` uses 60 seconds."""
        streaming_config = self._config_cls()
        assert streaming_config().fresh_final_after_seconds == 60.0

    def test_from_dict_uses_default_when_missing(self):
        """``from_dict`` without the key yields the 60 second default."""
        streaming_config = self._config_cls()
        loaded = streaming_config.from_dict({"enabled": True})
        assert loaded.fresh_final_after_seconds == 60.0

    def test_from_dict_respects_explicit_zero(self):
        """An explicit ``0`` is kept as 0.0 rather than replaced by the default."""
        streaming_config = self._config_cls()
        payload = {"enabled": True, "fresh_final_after_seconds": 0}
        loaded = streaming_config.from_dict(payload)
        assert loaded.fresh_final_after_seconds == 0.0

    def test_to_dict_round_trip(self):
        """``to_dict`` followed by ``from_dict`` preserves the value."""
        streaming_config = self._config_cls()
        source = streaming_config(fresh_final_after_seconds=90.0)
        rebuilt = streaming_config.from_dict(source.to_dict())
        assert rebuilt.fresh_final_after_seconds == 90.0
class TestTelegramAdapterDeleteMessage:
    """Contract checks for the adapters' ``delete_message`` method."""

    @staticmethod
    def _first_three_params(func) -> list[str]:
        """Return the names of the first three parameters of *func*."""
        import inspect

        return list(inspect.signature(func).parameters)[:3]

    def test_delete_message_method_exists(self):
        """``TelegramAdapter`` defines ``delete_message(self, chat_id, message_id, ...)``.

        Skipped when ``gateway.platforms.telegram`` cannot be imported.
        """
        telegram_module = pytest.importorskip("gateway.platforms.telegram")
        adapter_cls = telegram_module.TelegramAdapter

        assert hasattr(adapter_cls, "delete_message"), (
            "TelegramAdapter.delete_message is required for the fresh-final "
            "cleanup path (openclaw/openclaw#72038 port)."
        )
        assert self._first_three_params(adapter_cls.delete_message) == [
            "self", "chat_id", "message_id",
        ]

    def test_base_adapter_default_returns_false(self):
        """``BasePlatformAdapter.delete_message`` has the expected signature.

        NOTE(review): despite the test name, only the leading parameter names
        are checked here; the default implementation's return value is not
        exercised.
        """
        from gateway.platforms.base import BasePlatformAdapter

        leading = self._first_three_params(BasePlatformAdapter.delete_message)
        assert leading == ["self", "chat_id", "message_id"]