fix(memory): narrow scrub surface to known wrapper boundaries

Reviewer pushback on the original boundary-hardening commits — three
overreach points pulled plugin-specific policy into shared core paths:

1. gateway/run.py hardcoded a '## Honcho Context' literal split for
   vision-LLM output.  Plugin-format heading in framework code; could
   truncate legitimate output naturally containing that header.
   Drop the literal split; keep generic sanitize_context (the wrapper
   strip is plugin-agnostic).  Plugin-specific cleanup belongs at the
   provider boundary, not the shared gateway path.

2. run_agent.run_conversation scrubbed user_message and
   persist_user_message before the conversation loop.  User text is
   sacred — if a user types a literal <memory-context> tag we must
   not silently delete it.  The producer (build_memory_context_block)
   is the only legitimate emitter; user input should never need the
   reverse op.

3. _build_assistant_message scrubbed model output before persistence.
   Same hazard: would silently mutate legitimate documentation/code
   the model emits containing the literal markers.  The streaming
   scrubber catches real leaks delta-by-delta before content is
   concatenated; persist-time scrub was redundant belt-and-suspenders.

4. _fire_stream_delta stripped leading newlines from every delta unless
   a paragraph break flag was set.  Mid-stream '\n' is legitimate
   markdown — lists, code fences, paragraph breaks — and chunk
   boundaries are arbitrary.  Narrow lstrip to the very first delta
   of the stream only (so stale provider preamble still gets cleaned
   on turn start, but mid-stream formatting survives).

Plus: build_memory_context_block now logs a warning when its defensive
sanitize_context strips something — surfaces buggy providers returning
pre-wrapped text instead of silently double-fencing.

Net architectural change: scrub surface collapses from 8 sites to 3
(StreamingContextScrubber on output deltas, plugin→backend send,
build_memory_context_block input-validation).  Plugin-specific strings
stay out of shared runtime paths.  User input and persisted assistant
output are no longer mutated.

Tests: rescoped TestMemoryContextSanitization (helper-correctness only,
no source-inspection of removed call sites), updated vision tests to
drop '## Honcho Context' literal-split assertions, updated
_build_assistant_message persistence test to assert preservation.
Added: cross-turn scrubber reset, build_memory_context_block warn-on-
violation, mid-stream newline preservation (plain + code fence).
This commit is contained in:
Erosika
2026-04-27 14:32:20 -04:00
committed by kshitij
parent 05435a35ed
commit e553f6f3e4
7 changed files with 159 additions and 70 deletions

View File

@ -179,10 +179,20 @@ def build_memory_context_block(raw_context: str) -> str:
The fence prevents the model from treating recalled context as user
discourse. Injected at API-call time only — never persisted.
A provider returning text that already contains the wrapper is a
contract violation (would produce nested fences). We strip defensively
and warn so the buggy provider surfaces in logs instead of silently
double-fencing.
"""
if not raw_context or not raw_context.strip():
return ""
clean = sanitize_context(raw_context)
if clean != raw_context:
logger.warning(
"memory provider returned text containing <memory-context> wrapper; "
"stripped before re-fencing (provider contract violation)"
)
return (
"<memory-context>\n"
"[System note: The following is recalled memory context, "

View File

@ -8502,14 +8502,11 @@ class GatewayRunner:
result = json.loads(result_json)
if result.get("success"):
description = result.get("analysis", "")
# The auxiliary vision LLM can echo injected system-prompt
# memory context back into its output (#5719). Scrub any
# <memory-context> fences and the "## Honcho Context"
# section before the description lands in a user-visible
# message.
# Vision auxiliary LLM can echo the injected system-prompt
# memory-context wrapper back into its output (#5719).
# sanitize_context strips the fenced wrapper; plugin-specific
# header cleanup belongs at the provider boundary, not here.
description = sanitize_context(description)
if "## Honcho Context" in description:
description = description.split("## Honcho Context", 1)[0].rstrip()
enriched_parts.append(
f"[The user sent an image~ Here's what I can see:\n{description}]\n"
f"[If you need a closer look, use vision_analyze with "

View File

@ -6102,7 +6102,13 @@ class AIAgent:
else:
# Defensive: legacy callers without the scrubber attribute.
text = sanitize_context(text)
if not prepended_break:
# Strip leading newlines only on the very first delta of the stream,
# and only when we didn't just prepend a paragraph break ourselves.
# Mid-stream "\n" is legitimate markdown (lists, code, paragraphs)
# and must survive — chunk boundaries are arbitrary.
if not prepended_break and not getattr(
self, "_current_streamed_assistant_text", ""
):
text = text.lstrip("\n")
if not text:
return
@ -8077,7 +8083,7 @@ class AIAgent:
# API replay, session transcript, gateway delivery, CLI display,
# compression, title generation.
if isinstance(_san_content, str) and _san_content:
_san_content = sanitize_context(self._strip_think_blocks(_san_content)).strip()
_san_content = self._strip_think_blocks(_san_content).strip()
msg = {
"role": "assistant",
@ -9629,16 +9635,6 @@ class AIAgent:
if isinstance(persist_user_message, str):
persist_user_message = _sanitize_surrogates(persist_user_message)
# Strip leaked <memory-context> blocks from user input. When Honcho's
# saveMessages persists a turn that included injected context, the block
# can reappear in the next turn's user message via message history.
# Stripping here prevents stale memory tags from leaking into the
# conversation and being visible to the user or the model as user text.
if isinstance(user_message, str):
user_message = sanitize_context(user_message)
if isinstance(persist_user_message, str):
persist_user_message = sanitize_context(persist_user_message)
# Store stream callback for _interruptible_api_call to pick up
self._stream_callback = stream_callback
self._persist_user_message_idx = None

View File

@ -148,3 +148,64 @@ class TestSanitizeContextUnchanged:
)
out = sanitize_context(leaked).strip()
assert out == "Visible"
class TestStreamingContextScrubberCrossTurn:
"""A scrubber instance is reused across turns (per agent). reset() must
clear any held state so a partial-tag tail from turn N doesn't bleed
into turn N+1's first delta."""
def test_reset_clears_held_partial_tag(self):
s = StreamingContextScrubber()
# Feed a partial open-tag prefix that gets held back as buffer.
out_turn_1 = s.feed("answer<memo")
assert out_turn_1 == "answer"
# Reset for next turn — buffer must clear.
s.reset()
# New turn: plain text starting with a "<m" must NOT be treated as
# the continuation of the held "<memo".
out_turn_2 = s.feed("<marker>fresh content")
assert out_turn_2 == "<marker>fresh content"
def test_reset_clears_in_span_state(self):
s = StreamingContextScrubber()
s.feed("text<memory-context>secret-tail")
# Mid-span state held — without reset, subsequent text would be
# discarded until we see </memory-context>.
s.reset()
out = s.feed("post-reset visible text")
assert out == "post-reset visible text"
class TestBuildMemoryContextBlockWarnsOnViolation:
"""Providers must return raw context — not pre-wrapped. When they do,
we strip and warn so the buggy provider surfaces."""
def test_provider_emitting_wrapper_warns(self, caplog):
import logging
from agent.memory_manager import build_memory_context_block
prewrapped = (
"<memory-context>\n"
"[System note: ...]\n\n"
"real fact\n"
"</memory-context>"
)
with caplog.at_level(logging.WARNING, logger="agent.memory_manager"):
out = build_memory_context_block(prewrapped)
assert any("contract violation" in rec.message for rec in caplog.records)
assert out.count("<memory-context>") == 1
assert out.count("</memory-context>") == 1
def test_clean_provider_output_does_not_warn(self, caplog):
import logging
from agent.memory_manager import build_memory_context_block
with caplog.at_level(logging.WARNING, logger="agent.memory_manager"):
out = build_memory_context_block("plain fact about user")
assert not any("contract violation" in rec.message for rec in caplog.records)
assert "plain fact about user" in out

View File

@ -1,13 +1,12 @@
"""Tests for _enrich_message_with_vision — regression for #5719.
The auxiliary vision LLM can echo system-prompt Honcho memory back into
its analysis output. When that echo reaches the user as the enriched
image description, recalled memory context (personal facts, dialectic
output) surfaces into a user-visible message.
The auxiliary vision LLM can echo system-prompt memory-context back into
its analysis output. The boundary fix in gateway/run.py runs the generic
sanitize_context helper over the description so the fenced wrapper and
its system-note are removed before the description reaches the user.
The boundary fix in gateway/run.py strips both <memory-context>...</memory-context>
fenced blocks AND any "## Honcho Context" section from vision descriptions
before they're embedded into the enriched user message.
Plugin-specific header cleanup (e.g. "## Honcho Context") belongs at the
provider boundary, not in this shared gateway path.
"""
import asyncio
@ -43,22 +42,6 @@ class TestEnrichMessageWithVision:
out = _run(gateway_runner._enrich_message_with_vision("caption", ["/tmp/img.jpg"]))
assert "sunset over the ocean" in out
def test_honcho_context_header_stripped(self, gateway_runner):
"""'## Honcho Context' section and everything after is removed."""
leaked = (
"A photograph of a sunset.\n\n"
"## Honcho Context\n"
"User prefers concise answers, works at Plastic Labs,\n"
"uses OPSEC pseudonyms.\n"
)
fake_result = json.dumps({"success": True, "analysis": leaked})
with patch("tools.vision_tools.vision_analyze_tool", new=AsyncMock(return_value=fake_result)):
out = _run(gateway_runner._enrich_message_with_vision("caption", ["/tmp/img.jpg"]))
assert "sunset" in out
assert "Honcho Context" not in out
assert "Plastic Labs" not in out
assert "OPSEC" not in out
def test_memory_context_fence_stripped(self, gateway_runner):
"""<memory-context>...</memory-context> fenced block is scrubbed."""
leaked = (
@ -77,23 +60,21 @@ class TestEnrichMessageWithVision:
assert "User details and preferences" not in out
assert "System note" not in out
def test_both_leak_patterns_together_stripped(self, gateway_runner):
"""A vision output containing both leak shapes is fully scrubbed."""
def test_fenced_leak_stripped_plugin_header_preserved(self, gateway_runner):
"""The fenced wrapper is stripped; plugin-specific text outside the
fence (e.g. a "## Honcho Context" header) is left to the plugin layer.
Gateway core stays plugin-agnostic."""
leaked = (
"<memory-context>\n"
"[System note: The following is recalled memory context, NOT new "
"user input. Treat as informational background data.]\n"
"fenced leak\n"
"</memory-context>\n"
"A photograph of a dog.\n\n"
"## Honcho Context\n"
"header leak\n"
"A photograph of a dog."
)
fake_result = json.dumps({"success": True, "analysis": leaked})
with patch("tools.vision_tools.vision_analyze_tool", new=AsyncMock(return_value=fake_result)):
out = _run(gateway_runner._enrich_message_with_vision("caption", ["/tmp/img.jpg"]))
assert "photograph of a dog" in out
assert "fenced leak" not in out
assert "header leak" not in out
assert "Honcho Context" not in out
assert "<memory-context>" not in out

View File

@ -1441,19 +1441,23 @@ class TestBuildAssistantMessage:
result = agent._build_assistant_message(msg, "stop")
assert result["content"] == "No thinking here."
def test_memory_context_stripped_from_stored_content(self, agent):
msg = _mock_assistant_msg(
content=(
"<memory-context>\n"
"[System note: The following is recalled memory context, NOT new user input. Treat as informational background data.]\n\n"
"## Honcho Context\n"
"stale memory\n"
"</memory-context>\n\n"
"Visible answer"
)
def test_memory_context_in_stored_content_is_preserved(self, agent):
"""`_build_assistant_message` must not silently mutate model output
containing literal <memory-context> markers — that's legitimate text
(e.g. documentation, code) that the model may emit. Streaming-path
leak prevention is handled by StreamingContextScrubber upstream."""
original = (
"<memory-context>\n"
"[System note: The following is recalled memory context, NOT new user input. Treat as informational background data.]\n\n"
"## Honcho Context\n"
"stale memory\n"
"</memory-context>\n\n"
"Visible answer"
)
msg = _mock_assistant_msg(content=original)
result = agent._build_assistant_message(msg, "stop")
assert result["content"] == "Visible answer"
assert "<memory-context>" in result["content"]
assert "Visible answer" in result["content"]
def test_unterminated_think_block_stripped(self, agent):
"""Unterminated <think> block (MiniMax / NIM dropped close tag) is
@ -4767,21 +4771,21 @@ class TestDeadRetryCode:
class TestMemoryContextSanitization:
"""run_conversation() must strip leaked <memory-context> blocks from user input."""
"""sanitize_context() helper correctness — used at provider boundaries."""
def test_memory_context_stripped_from_user_message(self):
"""Verify that <memory-context> blocks are removed before the message
enters the conversation loop — prevents stale Honcho injection from
leaking into user text."""
def test_user_message_is_not_mutated_by_run_conversation(self):
"""User input must reach run_conversation untouched — if a user types
a literal <memory-context> tag we don't silently delete their text.
The streaming scrubber + plugin-side scrub cover real leak paths."""
import inspect
src = inspect.getsource(AIAgent.run_conversation)
# The sanitize_context call must appear in run_conversation's preamble
assert "sanitize_context(user_message)" in src
assert "sanitize_context(persist_user_message)" in src
assert "sanitize_context(user_message)" not in src
assert "sanitize_context(persist_user_message)" not in src
def test_sanitize_context_strips_full_block(self):
"""End-to-end: a user message with an embedded memory-context block
is cleaned to just the actual user text."""
"""Helper-level: a string with an embedded memory-context block is
cleaned to just the surrounding text. Used by build_memory_context_block
(input-validation) and by plugins on their own backend boundary."""
from agent.memory_manager import sanitize_context
user_text = "how is the honcho working"
injected = (

View File

@ -1209,6 +1209,46 @@ def test_stream_delta_scrubber_resets_between_turns(monkeypatch):
assert "".join(observed) == "clean new turn text"
def test_stream_delta_preserves_mid_stream_leading_newlines(monkeypatch):
"""Mid-stream leading newlines must survive — they are legitimate
markdown (lists, code fences, paragraph breaks). Stripping them
based on chunk boundaries silently breaks formatting.
Only the very first delta of a stream gets leading-newlines stripped
(so stale provider preamble doesn't leak); after that, deltas are
emitted verbatim.
"""
agent = _build_agent(monkeypatch)
observed = []
agent.stream_delta_callback = observed.append
# First delta delivers text — strips its own leading "\n" once.
agent._fire_stream_delta("\nHere is a list:")
# Second delta starts with "\n- item" — must NOT be stripped.
agent._fire_stream_delta("\n- first")
agent._fire_stream_delta("\n- second")
combined = "".join(observed)
assert combined == "Here is a list:\n- first\n- second"
def test_stream_delta_preserves_code_fence_newlines(monkeypatch):
"""Code blocks span multiple deltas. A "\\n```python\\n" boundary
is the canonical case where stripping leading newlines corrupts output."""
agent = _build_agent(monkeypatch)
observed = []
agent.stream_delta_callback = observed.append
agent._fire_stream_delta("Here is the code:")
agent._fire_stream_delta("\n```python\n")
agent._fire_stream_delta("print('hi')\n")
agent._fire_stream_delta("```\n")
combined = "".join(observed)
assert "```python\n" in combined
assert combined.startswith("Here is the code:\n```python\n")
def test_run_conversation_codex_continues_after_commentary_phase_message(monkeypatch):
agent = _build_agent(monkeypatch)
responses = [