fix(gateway): keep JSON-embedded MEDIA: text verbatim in cleaned output

Self-review of #34375 fix: the cleanup path ran media_pattern.sub('') over the JSON-masked copy of the text, which baked the masking spaces into the user-visible 'cleaned' string — a serialized tool result like {"old":"MEDIA:/x.png"} came back as {"old":" "}. Now mask only a length-equal copy of 'cleaned' to locate the real tag spans, then delete those spans from the unmasked 'cleaned'. Real tags are stripped; JSON-embedded MEDIA: text reads back verbatim. Masking 'cleaned' (not the original 'content') keeps offsets valid after the [[audio_as_voice]] / [[as_document]] directives are removed. Adds two cleaned-text regression tests.
2026-06-01 11:52:57 +05:30
parent e8827ef704
commit fb1b681b3b
2 changed files with 35 additions and 5 deletions
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@@ -2677,12 +2677,22 @@ class BasePlatformAdapter(ABC):
                    # and dropping every other attachment in the response.
                    continue

-        # Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
+        # Remove the delivered MEDIA tags from the user-visible text. Mask
+        # ``cleaned`` (same length, so offsets line up with it), find the real
+        # tag spans there, and delete those spans from the *unmasked* ``cleaned``.
+        # This strips real tags while leaving JSON-embedded MEDIA: text intact —
+        # it is stored data, not a directive, and must read back verbatim
+        # (#34375). Masking ``cleaned`` (not ``content``) keeps offsets valid
+        # after the [[audio_as_voice]] / [[as_document]] directives are removed.
        if media:
-            # Mask JSON-embedded tags before sub so they stay intact in the
-            # user-visible text (they are stored data, not directives).
-            cleaned = media_pattern.sub('', BasePlatformAdapter._mask_json_string_media(cleaned))
-            cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
+            masked_cleaned = BasePlatformAdapter._mask_json_string_media(cleaned)
+            spans = [m.span() for m in media_pattern.finditer(masked_cleaned)]
+            if spans:
+                chars = list(cleaned)
+                for start, end in sorted(spans, reverse=True):
+                    del chars[start:end]
+                cleaned = "".join(chars)
+                cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
        
        return media, cleaned

--- a/tests/gateway/test_platform_base.py
+++ b/tests/gateway/test_platform_base.py
@@ -467,6 +467,26 @@ class TestMediaInsideSerializedJson:
        assert len(media) == 1 and media[0][0] == "/tmp/v.ogg"
        assert media[0][1] is True  # voice flag

+    # --- cleaned-text invariants: real tags stripped, JSON data kept verbatim ---
+
+    def test_json_embedded_media_kept_verbatim_in_cleaned_text(self):
+        """A real tag is delivered+stripped; a JSON-embedded MEDIA: stays as
+        literal text (stored data must read back unchanged)."""
+        content = 'MEDIA:/real/r.png\nlog: {"old":"MEDIA:/stale/s.png"}'
+        media, cleaned = BasePlatformAdapter.extract_media(content)
+        assert [p for p, _ in media] == ["/real/r.png"]
+        # The JSON-embedded path must survive verbatim — not blanked to spaces.
+        assert '{"old":"MEDIA:/stale/s.png"}' in cleaned
+
+    def test_cleaned_text_after_directive_not_truncated(self):
+        """Stripping a tag preceded by a [[as_document]] directive must not
+        shift offsets and chop the path or trailing text."""
+        content = "See [[as_document]] MEDIA:/d/report.pdf now"
+        media, cleaned = BasePlatformAdapter.extract_media(content)
+        assert [p for p, _ in media] == ["/d/report.pdf"]
+        assert "MEDIA:" not in cleaned          # real tag removed
+        assert cleaned.endswith("now")          # trailing text intact (not chopped)
+

 class TestMediaExtensionAllowlistParity:
    """Regression coverage for issue #34517 — the MEDIA: extension black hole.