fix(gateway): keep JSON-embedded MEDIA: text verbatim in cleaned output
Self-review of the #34375 fix: the cleanup path ran media_pattern.sub('') over the JSON-masked copy of the text, which baked the masking spaces into the user-visible 'cleaned' string — a serialized tool result like {"old":"MEDIA:/x.png"} came back as {"old":" "}. The fix masks a same-length copy of 'cleaned' only to locate the real tag spans, then deletes those spans from the unmasked 'cleaned'. Real tags are stripped; JSON-embedded MEDIA: text reads back verbatim. Masking 'cleaned' (rather than the original 'content') keeps offsets valid after the [[audio_as_voice]] / [[as_document]] directives are removed. Adds two regression tests for the cleaned text.
This commit is contained in:
@ -2677,12 +2677,22 @@ class BasePlatformAdapter(ABC):
|
||||
# and dropping every other attachment in the response.
|
||||
continue
|
||||
|
||||
# Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
|
||||
# Remove the delivered MEDIA tags from the user-visible text. Mask
|
||||
# ``cleaned`` (same length, so offsets line up with it), find the real
|
||||
# tag spans there, and delete those spans from the *unmasked* ``cleaned``.
|
||||
# This strips real tags while leaving JSON-embedded MEDIA: text intact —
|
||||
# it is stored data, not a directive, and must read back verbatim
|
||||
# (#34375). Masking ``cleaned`` (not ``content``) keeps offsets valid
|
||||
# after the [[audio_as_voice]] / [[as_document]] directives are removed.
|
||||
if media:
|
||||
# Mask JSON-embedded tags before sub so they stay intact in the
|
||||
# user-visible text (they are stored data, not directives).
|
||||
cleaned = media_pattern.sub('', BasePlatformAdapter._mask_json_string_media(cleaned))
|
||||
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
|
||||
masked_cleaned = BasePlatformAdapter._mask_json_string_media(cleaned)
|
||||
spans = [m.span() for m in media_pattern.finditer(masked_cleaned)]
|
||||
if spans:
|
||||
chars = list(cleaned)
|
||||
for start, end in sorted(spans, reverse=True):
|
||||
del chars[start:end]
|
||||
cleaned = "".join(chars)
|
||||
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
|
||||
|
||||
return media, cleaned
|
||||
|
||||
|
||||
@ -467,6 +467,26 @@ class TestMediaInsideSerializedJson:
|
||||
assert len(media) == 1 and media[0][0] == "/tmp/v.ogg"
|
||||
assert media[0][1] is True # voice flag
|
||||
|
||||
# --- cleaned-text invariants: real tags stripped, JSON data kept verbatim ---
|
||||
|
||||
def test_json_embedded_media_kept_verbatim_in_cleaned_text(self):
    """A real tag is delivered and stripped; a JSON-embedded MEDIA: stays
    as literal text (stored data must read back unchanged).

    Covers both halves of the cleaned-text invariant: the delivered tag
    disappears from the user-visible text, and the MEDIA: text inside the
    serialized JSON is left untouched.
    """
    content = 'MEDIA:/real/r.png\nlog: {"old":"MEDIA:/stale/s.png"}'
    media, cleaned = BasePlatformAdapter.extract_media(content)
    # Only the real (non-JSON) tag is reported as deliverable media.
    assert [p for p, _ in media] == ["/real/r.png"]
    # The delivered tag must no longer appear in the user-visible text.
    assert "/real/r.png" not in cleaned
    # The JSON-embedded path must survive verbatim — not blanked to spaces.
    assert '{"old":"MEDIA:/stale/s.png"}' in cleaned
|
||||
|
||||
def test_cleaned_text_after_directive_not_truncated(self):
    """Removing a tag that follows a [[as_document]] directive must keep
    span offsets aligned, so neither the extracted path nor the text after
    the tag gets chopped."""
    raw_text = "See [[as_document]] MEDIA:/d/report.pdf now"
    attachments, visible_text = BasePlatformAdapter.extract_media(raw_text)

    # The full path is extracted intact.
    delivered_paths = [path for path, _flag in attachments]
    assert delivered_paths == ["/d/report.pdf"]

    # The real tag is gone from the user-visible text ...
    assert "MEDIA:" not in visible_text
    # ... and the text after it is untouched (not chopped).
    assert visible_text.endswith("now")
|
||||
|
||||
|
||||
class TestMediaExtensionAllowlistParity:
|
||||
"""Regression coverage for issue #34517 — the MEDIA: extension black hole.
|
||||
|
||||
Reference in New Issue
Block a user