fix(gateway): skip MEDIA: inside serialized JSON string values

Serialized tool results frequently embed a prior reply's text, e.g.
{"result": "MEDIA:/path/stale.png"}. The bare-path branch of
MEDIA_TAG_CLEANUP_RE matched these and re-delivered stale files (#34375).

Adds BasePlatformAdapter._mask_json_string_media, which blanks (offset-
preserving) only MEDIA:<bare-path> tokens that sit inside a JSON value-
context string (opened by : , { or [). Legitimate tags at line start,
after prose, indented, MEDIA:"quoted" form, and two-line TTS output are
all left untouched.

Reworked from the approach in #34388 (a line-start regex anchor), which
no longer applied to current main and regressed same-line/indented tags.

Co-authored-by: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com>
This commit is contained in:
liuhao1024
2026-06-01 11:39:40 +05:30
committed by kshitij
parent b3aaf2676b
commit e8827ef704
2 changed files with 114 additions and 2 deletions

View File

@ -2579,6 +2579,46 @@ class BasePlatformAdapter(ABC):
logger.warning("Skipping unsafe local file path: %s", _log_safe_path(raw))
return safe_paths
@staticmethod
def _mask_json_string_media(content: str) -> str:
"""Blank out ``MEDIA:<bare-path>`` occurrences that sit inside a JSON
string *value* so they are never delivered as real attachments.
Serialized tool results frequently embed a previous reply's text, e.g.::
{"result": "MEDIA:/Users/x/.hermes/media/generated/stale.png"}
Here the ``MEDIA:`` is part of stored text, not an outbound directive,
but the bare-path branch of ``MEDIA_TAG_CLEANUP_RE`` would still match it
and re-deliver a stale file. (Regression report #34375.)
The discriminator is precise so legitimate tags are untouched:
* Only spans opened by a JSON value-context quote (``:``, ``,``, ``{`` or
``[`` immediately before the ``"``) are considered.
* Within such a span, only a ``MEDIA:`` followed by a **bare** path
(``/``, ``~/`` or ``X:\\``) is masked. A ``MEDIA:"..."`` quoted-path
tag — a real LLM output format the extractor supports — is not bare and
is left alone.
* Tags at line start, after prose whitespace, or indented are outside any
JSON value span and are never affected.
Offsets are preserved (matched chars replaced with spaces, newlines kept)
so downstream match positions stay valid.
"""
if '"' not in content or "MEDIA:" not in content:
return content
chars = list(content)
# JSON value-context string: a quote preceded by : , { or [ (optional ws),
# capturing the (escape-aware) string body up to the closing quote.
for m in re.finditer(r'(?<=[:,{\[])\s*"((?:[^"\\\n]|\\.)*)"', content):
seg = m.group(1)
if re.search(r'MEDIA:\s*(?:~/|/|[A-Za-z]:[/\\])', seg):
for i in range(m.start(1), m.end(1)):
if chars[i] != '\n':
chars[i] = ' '
return ''.join(chars)
@staticmethod
def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
"""
@ -2621,7 +2661,10 @@ class BasePlatformAdapter(ABC):
# set is the shared MEDIA_DELIVERY_EXTS source of truth (built once into
# MEDIA_TAG_CLEANUP_RE) so it can never drift from extract_local_files.
media_pattern = MEDIA_TAG_CLEANUP_RE
for match in media_pattern.finditer(content):
# Mask MEDIA: embedded inside serialized JSON string values so stale
# paths from stored tool results are never re-delivered (#34375).
scan_content = BasePlatformAdapter._mask_json_string_media(content)
for match in media_pattern.finditer(scan_content):
path = match.group("path").strip()
if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'":
path = path[1:-1].strip()
@ -2636,7 +2679,9 @@ class BasePlatformAdapter(ABC):
# Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
if media:
cleaned = media_pattern.sub('', cleaned)
# Mask JSON-embedded tags before sub so they stay intact in the
# user-visible text (they are stored data, not directives).
cleaned = media_pattern.sub('', BasePlatformAdapter._mask_json_string_media(cleaned))
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
return media, cleaned

View File

@ -401,6 +401,73 @@ class TestExtractMedia:
assert media == []
class TestMediaInsideSerializedJson:
"""Regression coverage for #34375 — MEDIA: embedded in serialized JSON
string values (e.g. a stored previous reply inside a tool result) must not
be re-delivered as a real attachment, while legitimate MEDIA: tags in prose,
at line start, indented, or as quoted-path tags keep working.
"""
def test_media_in_json_value_not_extracted(self):
content = '{"result": "MEDIA:/tmp/stale.png"}'
media, _ = BasePlatformAdapter.extract_media(content)
assert media == [], f"JSON value MEDIA: leaked: {media}"
def test_media_in_pretty_json_value_not_extracted(self):
content = '{\n "tool_result": "MEDIA:/var/old.jpg"\n}'
media, _ = BasePlatformAdapter.extract_media(content)
assert media == [], f"pretty JSON MEDIA: leaked: {media}"
def test_media_in_json_array_not_extracted(self):
content = '["MEDIA:/a/b.png", "other"]'
media, _ = BasePlatformAdapter.extract_media(content)
assert media == [], f"JSON array MEDIA: leaked: {media}"
def test_media_in_nested_json_value_not_extracted(self):
content = '{"a":{"b":"see MEDIA:/x/y.pdf here"}}'
media, _ = BasePlatformAdapter.extract_media(content)
assert media == [], f"nested JSON MEDIA: leaked: {media}"
def test_media_in_embedded_serialized_reply_not_extracted(self):
"""A serialized tool result that embeds a prior reply's MEDIA: tag."""
content = (
'{"content":"previous reply MEDIA:/Users/ex/.hermes/media/'
'generated/stale.png and more text"}'
)
media, _ = BasePlatformAdapter.extract_media(content)
assert media == [], f"embedded serialized reply leaked: {media}"
# --- Legitimate tags must still extract (no regression vs line-start anchor) ---
def test_media_at_line_start_still_extracted(self):
media, _ = BasePlatformAdapter.extract_media("MEDIA:/real/file.png")
assert len(media) == 1 and media[0][0] == "/real/file.png"
def test_media_after_prose_same_line_still_extracted(self):
media, _ = BasePlatformAdapter.extract_media(
"Here is your file: MEDIA:/out/report.pdf"
)
assert len(media) == 1 and media[0][0] == "/out/report.pdf"
def test_media_indented_still_extracted(self):
media, _ = BasePlatformAdapter.extract_media(" MEDIA:/tmp/x.png")
assert len(media) == 1 and media[0][0] == "/tmp/x.png"
def test_quoted_path_media_still_extracted(self):
"""MEDIA:"..." quoted-path form (a real LLM output) is not JSON-masked."""
media, _ = BasePlatformAdapter.extract_media(
'MEDIA:"/path/with space/file.png"'
)
assert len(media) == 1 and media[0][0] == "/path/with space/file.png"
def test_tts_two_line_still_extracted(self):
media, _ = BasePlatformAdapter.extract_media(
"[[audio_as_voice]]\nMEDIA:/tmp/v.ogg"
)
assert len(media) == 1 and media[0][0] == "/tmp/v.ogg"
assert media[0][1] is True # voice flag
class TestMediaExtensionAllowlistParity:
"""Regression coverage for issue #34517 — the MEDIA: extension black hole.