diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index e75400085..c202d0564 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -2677,12 +2677,22 @@ class BasePlatformAdapter(ABC): # and dropping every other attachment in the response. continue - # Remove MEDIA tags from content (including surrounding quote/backtick wrappers) + # Remove the delivered MEDIA tags from the user-visible text. Mask + # ``cleaned`` (same length, so offsets line up with it), find the real + # tag spans there, and delete those spans from the *unmasked* ``cleaned``. + # This strips real tags while leaving JSON-embedded MEDIA: text intact — + # it is stored data, not a directive, and must read back verbatim + # (#34375). Masking ``cleaned`` (not ``content``) keeps offsets valid + # after the [[audio_as_voice]] / [[as_document]] directives are removed. if media: - # Mask JSON-embedded tags before sub so they stay intact in the - # user-visible text (they are stored data, not directives). - cleaned = media_pattern.sub('', BasePlatformAdapter._mask_json_string_media(cleaned)) - cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip() + masked_cleaned = BasePlatformAdapter._mask_json_string_media(cleaned) + spans = [m.span() for m in media_pattern.finditer(masked_cleaned)] + if spans: + chars = list(cleaned) + for start, end in sorted(spans, reverse=True): + del chars[start:end] + cleaned = "".join(chars) + cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip() return media, cleaned diff --git a/tests/gateway/test_platform_base.py b/tests/gateway/test_platform_base.py index 4fe61adf2..7c10e661f 100644 --- a/tests/gateway/test_platform_base.py +++ b/tests/gateway/test_platform_base.py @@ -467,6 +467,26 @@ class TestMediaInsideSerializedJson: assert len(media) == 1 and media[0][0] == "/tmp/v.ogg" assert media[0][1] is True # voice flag + # --- cleaned-text invariants: real tags stripped, JSON data kept verbatim --- + + def test_json_embedded_media_kept_verbatim_in_cleaned_text(self): + """A real tag is delivered+stripped; a JSON-embedded MEDIA: stays as + literal text (stored data must read back unchanged).""" + content = 'MEDIA:/real/r.png\nlog: {"old":"MEDIA:/stale/s.png"}' + media, cleaned = BasePlatformAdapter.extract_media(content) + assert [p for p, _ in media] == ["/real/r.png"] + # The JSON-embedded path must survive verbatim — not blanked to spaces. + assert '{"old":"MEDIA:/stale/s.png"}' in cleaned + + def test_cleaned_text_after_directive_not_truncated(self): + """Stripping a tag preceded by a [[as_document]] directive must not + shift offsets and chop the path or trailing text.""" + content = "See [[as_document]] MEDIA:/d/report.pdf now" + media, cleaned = BasePlatformAdapter.extract_media(content) + assert [p for p, _ in media] == ["/d/report.pdf"] + assert "MEDIA:" not in cleaned # real tag removed + assert cleaned.endswith("now") # trailing text intact (not chopped) + class TestMediaExtensionAllowlistParity: """Regression coverage for issue #34517 — the MEDIA: extension black hole.