fix(gateway): keep JSON-embedded MEDIA: text verbatim in cleaned output

Self-review of #34375 fix: the cleanup path ran media_pattern.sub('') over
the JSON-masked copy of the text, which baked the masking spaces into the
user-visible 'cleaned' string — a serialized tool result like
{"old":"MEDIA:/x.png"} came back as {"old":"          "}.

Now mask only a length-equal copy of 'cleaned' to locate the real tag spans,
then delete those spans from the unmasked 'cleaned'. Real tags are stripped;
JSON-embedded MEDIA: text reads back verbatim. Masking 'cleaned' (not the
original 'content') keeps offsets valid after the [[audio_as_voice]] /
[[as_document]] directives are removed. Adds two cleaned-text regression tests.
This commit is contained in:
kshitijk4poor
2026-06-01 11:52:57 +05:30
committed by kshitij
parent e8827ef704
commit fb1b681b3b
2 changed files with 35 additions and 5 deletions

View File

@ -2677,12 +2677,22 @@ class BasePlatformAdapter(ABC):
# and dropping every other attachment in the response.
continue
# Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
# Remove the delivered MEDIA tags from the user-visible text. Mask
# ``cleaned`` (same length, so offsets line up with it), find the real
# tag spans there, and delete those spans from the *unmasked* ``cleaned``.
# This strips real tags while leaving JSON-embedded MEDIA: text intact —
# it is stored data, not a directive, and must read back verbatim
# (#34375). Masking ``cleaned`` (not ``content``) keeps offsets valid
# after the [[audio_as_voice]] / [[as_document]] directives are removed.
if media:
# Mask JSON-embedded tags before sub so they stay intact in the
# user-visible text (they are stored data, not directives).
cleaned = media_pattern.sub('', BasePlatformAdapter._mask_json_string_media(cleaned))
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
masked_cleaned = BasePlatformAdapter._mask_json_string_media(cleaned)
spans = [m.span() for m in media_pattern.finditer(masked_cleaned)]
if spans:
chars = list(cleaned)
for start, end in sorted(spans, reverse=True):
del chars[start:end]
cleaned = "".join(chars)
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
return media, cleaned

View File

@ -467,6 +467,26 @@ class TestMediaInsideSerializedJson:
assert len(media) == 1 and media[0][0] == "/tmp/v.ogg"
assert media[0][1] is True # voice flag
# --- cleaned-text invariants: real tags stripped, JSON data kept verbatim ---
def test_json_embedded_media_kept_verbatim_in_cleaned_text(self):
    """Bare tags are extracted and stripped; JSON string values are untouched.

    The input carries one real ``MEDIA:`` tag on its own line plus a
    serialized JSON object whose string value merely *contains* ``MEDIA:``.
    Only the real tag may be reported as media, and the JSON object has to
    appear in the cleaned text exactly as written (regression for #34375,
    where the value came back blanked out with spaces).
    """
    raw_text = 'MEDIA:/real/r.png\nlog: {"old":"MEDIA:/stale/s.png"}'
    attachments, visible_text = BasePlatformAdapter.extract_media(raw_text)

    # Exactly one attachment: the real tag, not the JSON-embedded path.
    delivered_paths = [path for path, _flag in attachments]
    assert delivered_paths == ["/real/r.png"]

    # Stored data must read back unchanged, not as a run of masking spaces.
    assert '{"old":"MEDIA:/stale/s.png"}' in visible_text
def test_cleaned_text_after_directive_not_truncated(self):
    """A ``[[as_document]]`` directive before the tag must not skew offsets.

    Checks that the media path is extracted whole, that the tag is removed
    from the cleaned text, and that the text following the tag is still
    present at the end of the cleaned output.
    """
    raw_text = "See [[as_document]] MEDIA:/d/report.pdf now"
    attachments, visible_text = BasePlatformAdapter.extract_media(raw_text)

    delivered_paths = [path for path, _flag in attachments]
    assert delivered_paths == ["/d/report.pdf"]

    # The real tag is gone from the user-visible text ...
    assert "MEDIA:" not in visible_text
    # ... and the trailing word survived (nothing was chopped off the end).
    assert visible_text.endswith("now")
class TestMediaExtensionAllowlistParity:
"""Regression coverage for issue #34517 — the MEDIA: extension black hole.