fix(memory): narrow scrub surface to known wrapper boundaries

Reviewer pushback on the original boundary-hardening commits — three overreach points pulled plugin-specific policy into shared core paths: 1. gateway/run.py hardcoded a '## Honcho Context' literal split for vision-LLM output. Plugin-format heading in framework code; could truncate legitimate output naturally containing that header. Drop the literal split; keep generic sanitize_context (the wrapper strip is plugin-agnostic). Plugin-specific cleanup belongs at the provider boundary, not the shared gateway path. 2. run_agent.run_conversation scrubbed user_message and persist_user_message before the conversation loop. User text is sacred — if a user types a literal <memory-context> tag we must not silently delete it. The producer (build_memory_context_block) is the only legitimate emitter; user input should never need the reverse op. 3. _build_assistant_message scrubbed model output before persistence. Same hazard: would silently mutate legitimate documentation/code the model emits containing the literal markers. The streaming scrubber catches real leaks delta-by-delta before content is concatenated; persist-time scrub was redundant belt-and-suspenders. 4. _fire_stream_delta stripped leading newlines from every delta unless a paragraph break flag was set. Mid-stream '\n' is legitimate markdown — lists, code fences, paragraph breaks — and chunk boundaries are arbitrary. Narrow lstrip to the very first delta of the stream only (so stale provider preamble still gets cleaned on turn start, but mid-stream formatting survives). Plus: build_memory_context_block now logs a warning when its defensive sanitize_context strips something — surfaces buggy providers returning pre-wrapped text instead of silently double-fencing. Net architectural change: scrub surface collapses from 8 sites to 3 (StreamingContextScrubber on output deltas, plugin→backend send, build_memory_context_block input-validation). Plugin-specific strings stay out of shared runtime paths. User input and persisted assistant output are no longer mutated. Tests: rescoped TestMemoryContextSanitization (helper-correctness only, no source-inspection of removed call sites), updated vision tests to drop '## Honcho Context' literal-split assertions, updated _build_assistant_message persistence test to assert preservation. Added: cross-turn scrubber reset, build_memory_context_block warn-on- violation, mid-stream newline preservation (plain + code fence).
2026-04-27 14:32:20 -04:00
parent 05435a35ed
commit e553f6f3e4
7 changed files with 159 additions and 70 deletions
--- a/agent/memory_manager.py
+++ b/agent/memory_manager.py
@ -179,10 +179,20 @@ def build_memory_context_block(raw_context: str) -> str:

    The fence prevents the model from treating recalled context as user
    discourse.  Injected at API-call time only — never persisted.
+
+    A provider returning text that already contains the wrapper is a
+    contract violation (would produce nested fences).  We strip defensively
+    and warn so the buggy provider surfaces in logs instead of silently
+    double-fencing.
    """
    if not raw_context or not raw_context.strip():
        return ""
    clean = sanitize_context(raw_context)
+    if clean != raw_context:
+        logger.warning(
+            "memory provider returned text containing <memory-context> wrapper; "
+            "stripped before re-fencing (provider contract violation)"
+        )
    return (
        "<memory-context>\n"
        "[System note: The following is recalled memory context, "
--- a/gateway/run.py
+++ b/gateway/run.py
@ -8502,14 +8502,11 @@ class GatewayRunner:
                result = json.loads(result_json)
                if result.get("success"):
                    description = result.get("analysis", "")
-                    # The auxiliary vision LLM can echo injected system-prompt
-                    # memory context back into its output (#5719).  Scrub any
-                    # <memory-context> fences and the "## Honcho Context"
-                    # section before the description lands in a user-visible
-                    # message.
+                    # Vision auxiliary LLM can echo the injected system-prompt
+                    # memory-context wrapper back into its output (#5719).
+                    # sanitize_context strips the fenced wrapper; plugin-specific
+                    # header cleanup belongs at the provider boundary, not here.
                    description = sanitize_context(description)
-                    if "## Honcho Context" in description:
-                        description = description.split("## Honcho Context", 1)[0].rstrip()
                    enriched_parts.append(
                        f"[The user sent an image~ Here's what I can see:\n{description}]\n"
                        f"[If you need a closer look, use vision_analyze with "
--- a/run_agent.py
+++ b/run_agent.py
@ -6102,7 +6102,13 @@ class AIAgent:
            else:
                # Defensive: legacy callers without the scrubber attribute.
                text = sanitize_context(text)
-            if not prepended_break:
+            # Strip leading newlines only on the very first delta of the stream,
+            # and only when we didn't just prepend a paragraph break ourselves.
+            # Mid-stream "\n" is legitimate markdown (lists, code, paragraphs)
+            # and must survive — chunk boundaries are arbitrary.
+            if not prepended_break and not getattr(
+                self, "_current_streamed_assistant_text", ""
+            ):
                text = text.lstrip("\n")
        if not text:
            return
@ -8077,7 +8083,7 @@ class AIAgent:
        # API replay, session transcript, gateway delivery, CLI display,
        # compression, title generation.
        if isinstance(_san_content, str) and _san_content:
-            _san_content = sanitize_context(self._strip_think_blocks(_san_content)).strip()
+            _san_content = self._strip_think_blocks(_san_content).strip()

        msg = {
            "role": "assistant",
@ -9629,16 +9635,6 @@ class AIAgent:
        if isinstance(persist_user_message, str):
            persist_user_message = _sanitize_surrogates(persist_user_message)

-        # Strip leaked <memory-context> blocks from user input.  When Honcho's
-        # saveMessages persists a turn that included injected context, the block
-        # can reappear in the next turn's user message via message history.
-        # Stripping here prevents stale memory tags from leaking into the
-        # conversation and being visible to the user or the model as user text.
-        if isinstance(user_message, str):
-            user_message = sanitize_context(user_message)
-        if isinstance(persist_user_message, str):
-            persist_user_message = sanitize_context(persist_user_message)
-
        # Store stream callback for _interruptible_api_call to pick up
        self._stream_callback = stream_callback
        self._persist_user_message_idx = None
--- a/tests/agent/test_streaming_context_scrubber.py
+++ b/tests/agent/test_streaming_context_scrubber.py
@ -148,3 +148,64 @@ class TestSanitizeContextUnchanged:
        )
        out = sanitize_context(leaked).strip()
        assert out == "Visible"
+
+
+class TestStreamingContextScrubberCrossTurn:
+    """A scrubber instance is reused across turns (per agent).  reset() must
+    clear any held state so a partial-tag tail from turn N doesn't bleed
+    into turn N+1's first delta."""
+
+    def test_reset_clears_held_partial_tag(self):
+        s = StreamingContextScrubber()
+        # Feed a partial open-tag prefix that gets held back as buffer.
+        out_turn_1 = s.feed("answer<memo")
+        assert out_turn_1 == "answer"
+
+        # Reset for next turn — buffer must clear.
+        s.reset()
+
+        # New turn: plain text starting with a "<m" must NOT be treated as
+        # the continuation of the held "<memo".
+        out_turn_2 = s.feed("<marker>fresh content")
+        assert out_turn_2 == "<marker>fresh content"
+
+    def test_reset_clears_in_span_state(self):
+        s = StreamingContextScrubber()
+        s.feed("text<memory-context>secret-tail")
+        # Mid-span state held — without reset, subsequent text would be
+        # discarded until we see </memory-context>.
+        s.reset()
+        out = s.feed("post-reset visible text")
+        assert out == "post-reset visible text"
+
+
+class TestBuildMemoryContextBlockWarnsOnViolation:
+    """Providers must return raw context — not pre-wrapped.  When they do,
+    we strip and warn so the buggy provider surfaces."""
+
+    def test_provider_emitting_wrapper_warns(self, caplog):
+        import logging
+        from agent.memory_manager import build_memory_context_block
+
+        prewrapped = (
+            "<memory-context>\n"
+            "[System note: ...]\n\n"
+            "real fact\n"
+            "</memory-context>"
+        )
+        with caplog.at_level(logging.WARNING, logger="agent.memory_manager"):
+            out = build_memory_context_block(prewrapped)
+
+        assert any("contract violation" in rec.message for rec in caplog.records)
+        assert out.count("<memory-context>") == 1
+        assert out.count("</memory-context>") == 1
+
+    def test_clean_provider_output_does_not_warn(self, caplog):
+        import logging
+        from agent.memory_manager import build_memory_context_block
+
+        with caplog.at_level(logging.WARNING, logger="agent.memory_manager"):
+            out = build_memory_context_block("plain fact about user")
+
+        assert not any("contract violation" in rec.message for rec in caplog.records)
+        assert "plain fact about user" in out
--- a/tests/gateway/test_vision_memory_leak.py
+++ b/tests/gateway/test_vision_memory_leak.py
@ -1,13 +1,12 @@
 """Tests for _enrich_message_with_vision — regression for #5719.

-The auxiliary vision LLM can echo system-prompt Honcho memory back into
-its analysis output. When that echo reaches the user as the enriched
-image description, recalled memory context (personal facts, dialectic
-output) surfaces into a user-visible message.
+The auxiliary vision LLM can echo system-prompt memory-context back into
+its analysis output.  The boundary fix in gateway/run.py runs the generic
+sanitize_context helper over the description so the fenced wrapper and
+its system-note are removed before the description reaches the user.

-The boundary fix in gateway/run.py strips both <memory-context>...</memory-context>
-fenced blocks AND any "## Honcho Context" section from vision descriptions
-before they're embedded into the enriched user message.
+Plugin-specific header cleanup (e.g. "## Honcho Context") belongs at the
+provider boundary, not in this shared gateway path.
 """

 import asyncio
@ -43,22 +42,6 @@ class TestEnrichMessageWithVision:
            out = _run(gateway_runner._enrich_message_with_vision("caption", ["/tmp/img.jpg"]))
        assert "sunset over the ocean" in out

-    def test_honcho_context_header_stripped(self, gateway_runner):
-        """'## Honcho Context' section and everything after is removed."""
-        leaked = (
-            "A photograph of a sunset.\n\n"
-            "## Honcho Context\n"
-            "User prefers concise answers, works at Plastic Labs,\n"
-            "uses OPSEC pseudonyms.\n"
-        )
-        fake_result = json.dumps({"success": True, "analysis": leaked})
-        with patch("tools.vision_tools.vision_analyze_tool", new=AsyncMock(return_value=fake_result)):
-            out = _run(gateway_runner._enrich_message_with_vision("caption", ["/tmp/img.jpg"]))
-        assert "sunset" in out
-        assert "Honcho Context" not in out
-        assert "Plastic Labs" not in out
-        assert "OPSEC" not in out
-
    def test_memory_context_fence_stripped(self, gateway_runner):
        """<memory-context>...</memory-context> fenced block is scrubbed."""
        leaked = (
@ -77,23 +60,21 @@ class TestEnrichMessageWithVision:
        assert "User details and preferences" not in out
        assert "System note" not in out

-    def test_both_leak_patterns_together_stripped(self, gateway_runner):
-        """A vision output containing both leak shapes is fully scrubbed."""
+    def test_fenced_leak_stripped_plugin_header_preserved(self, gateway_runner):
+        """The fenced wrapper is stripped; plugin-specific text outside the
+        fence (e.g. a "## Honcho Context" header) is left to the plugin layer.
+        Gateway core stays plugin-agnostic."""
        leaked = (
            "<memory-context>\n"
            "[System note: The following is recalled memory context, NOT new "
            "user input. Treat as informational background data.]\n"
            "fenced leak\n"
            "</memory-context>\n"
-            "A photograph of a dog.\n\n"
-            "## Honcho Context\n"
-            "header leak\n"
+            "A photograph of a dog."
        )
        fake_result = json.dumps({"success": True, "analysis": leaked})
        with patch("tools.vision_tools.vision_analyze_tool", new=AsyncMock(return_value=fake_result)):
            out = _run(gateway_runner._enrich_message_with_vision("caption", ["/tmp/img.jpg"]))
        assert "photograph of a dog" in out
        assert "fenced leak" not in out
-        assert "header leak" not in out
-        assert "Honcho Context" not in out
        assert "<memory-context>" not in out
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@ -1441,19 +1441,23 @@ class TestBuildAssistantMessage:
        result = agent._build_assistant_message(msg, "stop")
        assert result["content"] == "No thinking here."

-    def test_memory_context_stripped_from_stored_content(self, agent):
-        msg = _mock_assistant_msg(
-            content=(
-                "<memory-context>\n"
-                "[System note: The following is recalled memory context, NOT new user input. Treat as informational background data.]\n\n"
-                "## Honcho Context\n"
-                "stale memory\n"
-                "</memory-context>\n\n"
-                "Visible answer"
-            )
+    def test_memory_context_in_stored_content_is_preserved(self, agent):
+        """`_build_assistant_message` must not silently mutate model output
+        containing literal <memory-context> markers — that's legitimate text
+        (e.g. documentation, code) that the model may emit.  Streaming-path
+        leak prevention is handled by StreamingContextScrubber upstream."""
+        original = (
+            "<memory-context>\n"
+            "[System note: The following is recalled memory context, NOT new user input. Treat as informational background data.]\n\n"
+            "## Honcho Context\n"
+            "stale memory\n"
+            "</memory-context>\n\n"
+            "Visible answer"
        )
+        msg = _mock_assistant_msg(content=original)
        result = agent._build_assistant_message(msg, "stop")
-        assert result["content"] == "Visible answer"
+        assert "<memory-context>" in result["content"]
+        assert "Visible answer" in result["content"]

    def test_unterminated_think_block_stripped(self, agent):
        """Unterminated <think> block (MiniMax / NIM dropped close tag) is
@ -4767,21 +4771,21 @@ class TestDeadRetryCode:


 class TestMemoryContextSanitization:
-    """run_conversation() must strip leaked <memory-context> blocks from user input."""
+    """sanitize_context() helper correctness — used at provider boundaries."""

-    def test_memory_context_stripped_from_user_message(self):
-        """Verify that <memory-context> blocks are removed before the message
-        enters the conversation loop — prevents stale Honcho injection from
-        leaking into user text."""
+    def test_user_message_is_not_mutated_by_run_conversation(self):
+        """User input must reach run_conversation untouched — if a user types
+        a literal <memory-context> tag we don't silently delete their text.
+        The streaming scrubber + plugin-side scrub cover real leak paths."""
        import inspect
        src = inspect.getsource(AIAgent.run_conversation)
-        # The sanitize_context call must appear in run_conversation's preamble
-        assert "sanitize_context(user_message)" in src
-        assert "sanitize_context(persist_user_message)" in src
+        assert "sanitize_context(user_message)" not in src
+        assert "sanitize_context(persist_user_message)" not in src

    def test_sanitize_context_strips_full_block(self):
-        """End-to-end: a user message with an embedded memory-context block
-        is cleaned to just the actual user text."""
+        """Helper-level: a string with an embedded memory-context block is
+        cleaned to just the surrounding text.  Used by build_memory_context_block
+        (input-validation) and by plugins on their own backend boundary."""
        from agent.memory_manager import sanitize_context
        user_text = "how is the honcho working"
        injected = (
--- a/tests/run_agent/test_run_agent_codex_responses.py
+++ b/tests/run_agent/test_run_agent_codex_responses.py
@ -1209,6 +1209,46 @@ def test_stream_delta_scrubber_resets_between_turns(monkeypatch):
    assert "".join(observed) == "clean new turn text"


+def test_stream_delta_preserves_mid_stream_leading_newlines(monkeypatch):
+    """Mid-stream leading newlines must survive — they are legitimate
+    markdown (lists, code fences, paragraph breaks).  Stripping them
+    based on chunk boundaries silently breaks formatting.
+
+    Only the very first delta of a stream gets leading-newlines stripped
+    (so stale provider preamble doesn't leak); after that, deltas are
+    emitted verbatim.
+    """
+    agent = _build_agent(monkeypatch)
+    observed = []
+    agent.stream_delta_callback = observed.append
+
+    # First delta delivers text — strips its own leading "\n" once.
+    agent._fire_stream_delta("\nHere is a list:")
+    # Second delta starts with "\n- item" — must NOT be stripped.
+    agent._fire_stream_delta("\n- first")
+    agent._fire_stream_delta("\n- second")
+
+    combined = "".join(observed)
+    assert combined == "Here is a list:\n- first\n- second"
+
+
+def test_stream_delta_preserves_code_fence_newlines(monkeypatch):
+    """Code blocks span multiple deltas.  A "\\n```python\\n" boundary
+    is the canonical case where stripping leading newlines corrupts output."""
+    agent = _build_agent(monkeypatch)
+    observed = []
+    agent.stream_delta_callback = observed.append
+
+    agent._fire_stream_delta("Here is the code:")
+    agent._fire_stream_delta("\n```python\n")
+    agent._fire_stream_delta("print('hi')\n")
+    agent._fire_stream_delta("```\n")
+
+    combined = "".join(observed)
+    assert "```python\n" in combined
+    assert combined.startswith("Here is the code:\n```python\n")
+
+
 def test_run_conversation_codex_continues_after_commentary_phase_message(monkeypatch):
    agent = _build_agent(monkeypatch)
    responses = [