fix(gateway): skip MEDIA: inside serialized JSON string values
Serialized tool results frequently embed a prior reply's text, e.g.
{"result": "MEDIA:/path/stale.png"}. The bare-path branch of
MEDIA_TAG_CLEANUP_RE matched these and re-delivered stale files (#34375).
Adds BasePlatformAdapter._mask_json_string_media, which blanks (offset-
preserving) only MEDIA:<bare-path> tokens that sit inside a JSON value-
context string (opened by : , { or [). Legitimate tags at line start,
after prose, indented, MEDIA:"quoted" form, and two-line TTS output are
all left untouched.
Reworked from the approach in #34388 (a line-start regex anchor), which
no longer applied to current main and regressed same-line/indented tags.
Co-authored-by: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com>
This commit is contained in:
@ -2579,6 +2579,46 @@ class BasePlatformAdapter(ABC):
|
||||
logger.warning("Skipping unsafe local file path: %s", _log_safe_path(raw))
|
||||
return safe_paths
|
||||
|
||||
@staticmethod
|
||||
def _mask_json_string_media(content: str) -> str:
|
||||
"""Blank out ``MEDIA:<bare-path>`` occurrences that sit inside a JSON
|
||||
string *value* so they are never delivered as real attachments.
|
||||
|
||||
Serialized tool results frequently embed a previous reply's text, e.g.::
|
||||
|
||||
{"result": "MEDIA:/Users/x/.hermes/media/generated/stale.png"}
|
||||
|
||||
Here the ``MEDIA:`` is part of stored text, not an outbound directive,
|
||||
but the bare-path branch of ``MEDIA_TAG_CLEANUP_RE`` would still match it
|
||||
and re-deliver a stale file. (Regression report #34375.)
|
||||
|
||||
The discriminator is precise so legitimate tags are untouched:
|
||||
|
||||
* Only spans opened by a JSON value-context quote (``:``, ``,``, ``{`` or
|
||||
``[`` immediately before the ``"``) are considered.
|
||||
* Within such a span, only a ``MEDIA:`` followed by a **bare** path
|
||||
(``/``, ``~/`` or ``X:\\``) is masked. A ``MEDIA:"..."`` quoted-path
|
||||
tag — a real LLM output format the extractor supports — is not bare and
|
||||
is left alone.
|
||||
* Tags at line start, after prose whitespace, or indented are outside any
|
||||
JSON value span and are never affected.
|
||||
|
||||
Offsets are preserved (matched chars replaced with spaces, newlines kept)
|
||||
so downstream match positions stay valid.
|
||||
"""
|
||||
if '"' not in content or "MEDIA:" not in content:
|
||||
return content
|
||||
chars = list(content)
|
||||
# JSON value-context string: a quote preceded by : , { or [ (optional ws),
|
||||
# capturing the (escape-aware) string body up to the closing quote.
|
||||
for m in re.finditer(r'(?<=[:,{\[])\s*"((?:[^"\\\n]|\\.)*)"', content):
|
||||
seg = m.group(1)
|
||||
if re.search(r'MEDIA:\s*(?:~/|/|[A-Za-z]:[/\\])', seg):
|
||||
for i in range(m.start(1), m.end(1)):
|
||||
if chars[i] != '\n':
|
||||
chars[i] = ' '
|
||||
return ''.join(chars)
|
||||
|
||||
@staticmethod
|
||||
def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
|
||||
"""
|
||||
@ -2621,7 +2661,10 @@ class BasePlatformAdapter(ABC):
|
||||
# set is the shared MEDIA_DELIVERY_EXTS source of truth (built once into
|
||||
# MEDIA_TAG_CLEANUP_RE) so it can never drift from extract_local_files.
|
||||
media_pattern = MEDIA_TAG_CLEANUP_RE
|
||||
for match in media_pattern.finditer(content):
|
||||
# Mask MEDIA: embedded inside serialized JSON string values so stale
|
||||
# paths from stored tool results are never re-delivered (#34375).
|
||||
scan_content = BasePlatformAdapter._mask_json_string_media(content)
|
||||
for match in media_pattern.finditer(scan_content):
|
||||
path = match.group("path").strip()
|
||||
if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'":
|
||||
path = path[1:-1].strip()
|
||||
@ -2636,7 +2679,9 @@ class BasePlatformAdapter(ABC):
|
||||
|
||||
# Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
|
||||
if media:
|
||||
cleaned = media_pattern.sub('', cleaned)
|
||||
# Mask JSON-embedded tags before sub so they stay intact in the
|
||||
# user-visible text (they are stored data, not directives).
|
||||
cleaned = media_pattern.sub('', BasePlatformAdapter._mask_json_string_media(cleaned))
|
||||
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
|
||||
|
||||
return media, cleaned
|
||||
|
||||
@ -401,6 +401,73 @@ class TestExtractMedia:
|
||||
assert media == []
|
||||
|
||||
|
||||
class TestMediaInsideSerializedJson:
|
||||
"""Regression coverage for #34375 — MEDIA: embedded in serialized JSON
|
||||
string values (e.g. a stored previous reply inside a tool result) must not
|
||||
be re-delivered as a real attachment, while legitimate MEDIA: tags in prose,
|
||||
at line start, indented, or as quoted-path tags keep working.
|
||||
"""
|
||||
|
||||
def test_media_in_json_value_not_extracted(self):
|
||||
content = '{"result": "MEDIA:/tmp/stale.png"}'
|
||||
media, _ = BasePlatformAdapter.extract_media(content)
|
||||
assert media == [], f"JSON value MEDIA: leaked: {media}"
|
||||
|
||||
def test_media_in_pretty_json_value_not_extracted(self):
|
||||
content = '{\n "tool_result": "MEDIA:/var/old.jpg"\n}'
|
||||
media, _ = BasePlatformAdapter.extract_media(content)
|
||||
assert media == [], f"pretty JSON MEDIA: leaked: {media}"
|
||||
|
||||
def test_media_in_json_array_not_extracted(self):
|
||||
content = '["MEDIA:/a/b.png", "other"]'
|
||||
media, _ = BasePlatformAdapter.extract_media(content)
|
||||
assert media == [], f"JSON array MEDIA: leaked: {media}"
|
||||
|
||||
def test_media_in_nested_json_value_not_extracted(self):
|
||||
content = '{"a":{"b":"see MEDIA:/x/y.pdf here"}}'
|
||||
media, _ = BasePlatformAdapter.extract_media(content)
|
||||
assert media == [], f"nested JSON MEDIA: leaked: {media}"
|
||||
|
||||
def test_media_in_embedded_serialized_reply_not_extracted(self):
|
||||
"""A serialized tool result that embeds a prior reply's MEDIA: tag."""
|
||||
content = (
|
||||
'{"content":"previous reply MEDIA:/Users/ex/.hermes/media/'
|
||||
'generated/stale.png and more text"}'
|
||||
)
|
||||
media, _ = BasePlatformAdapter.extract_media(content)
|
||||
assert media == [], f"embedded serialized reply leaked: {media}"
|
||||
|
||||
# --- Legitimate tags must still extract (no regression vs line-start anchor) ---
|
||||
|
||||
def test_media_at_line_start_still_extracted(self):
|
||||
media, _ = BasePlatformAdapter.extract_media("MEDIA:/real/file.png")
|
||||
assert len(media) == 1 and media[0][0] == "/real/file.png"
|
||||
|
||||
def test_media_after_prose_same_line_still_extracted(self):
|
||||
media, _ = BasePlatformAdapter.extract_media(
|
||||
"Here is your file: MEDIA:/out/report.pdf"
|
||||
)
|
||||
assert len(media) == 1 and media[0][0] == "/out/report.pdf"
|
||||
|
||||
def test_media_indented_still_extracted(self):
|
||||
media, _ = BasePlatformAdapter.extract_media(" MEDIA:/tmp/x.png")
|
||||
assert len(media) == 1 and media[0][0] == "/tmp/x.png"
|
||||
|
||||
def test_quoted_path_media_still_extracted(self):
|
||||
"""MEDIA:"..." quoted-path form (a real LLM output) is not JSON-masked."""
|
||||
media, _ = BasePlatformAdapter.extract_media(
|
||||
'MEDIA:"/path/with space/file.png"'
|
||||
)
|
||||
assert len(media) == 1 and media[0][0] == "/path/with space/file.png"
|
||||
|
||||
def test_tts_two_line_still_extracted(self):
|
||||
media, _ = BasePlatformAdapter.extract_media(
|
||||
"[[audio_as_voice]]\nMEDIA:/tmp/v.ogg"
|
||||
)
|
||||
assert len(media) == 1 and media[0][0] == "/tmp/v.ogg"
|
||||
assert media[0][1] is True # voice flag
|
||||
|
||||
|
||||
class TestMediaExtensionAllowlistParity:
|
||||
"""Regression coverage for issue #34517 — the MEDIA: extension black hole.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user