From e8827ef704ac0ca239f7cca4cf170f9d1f08e58e Mon Sep 17 00:00:00 2001 From: liuhao1024 Date: Mon, 1 Jun 2026 11:39:40 +0530 Subject: [PATCH] fix(gateway): skip MEDIA: inside serialized JSON string values Serialized tool results frequently embed a prior reply's text, e.g. {"result": "MEDIA:/path/stale.png"}. The bare-path branch of MEDIA_TAG_CLEANUP_RE matched these and re-delivered stale files (#34375). Adds BasePlatformAdapter._mask_json_string_media, which blanks (offset- preserving) only MEDIA: tokens that sit inside a JSON value- context string (opened by : , { or [). Legitimate tags at line start, after prose, indented, MEDIA:"quoted" form, and two-line TTS output are all left untouched. Reworked from the approach in #34388 (a line-start regex anchor), which no longer applied to current main and regressed same-line/indented tags. Co-authored-by: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> --- gateway/platforms/base.py | 49 ++++++++++++++++++++- tests/gateway/test_platform_base.py | 67 +++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 2 deletions(-) diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index baa25b602..e75400085 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -2579,6 +2579,46 @@ class BasePlatformAdapter(ABC): logger.warning("Skipping unsafe local file path: %s", _log_safe_path(raw)) return safe_paths + @staticmethod + def _mask_json_string_media(content: str) -> str: + """Blank out ``MEDIA:`` occurrences that sit inside a JSON + string *value* so they are never delivered as real attachments. + + Serialized tool results frequently embed a previous reply's text, e.g.:: + + {"result": "MEDIA:/Users/x/.hermes/media/generated/stale.png"} + + Here the ``MEDIA:`` is part of stored text, not an outbound directive, + but the bare-path branch of ``MEDIA_TAG_CLEANUP_RE`` would still match it + and re-deliver a stale file. (Regression report #34375.) + + The discriminator is precise so legitimate tags are untouched: + + * Only spans opened by a JSON value-context quote (``:``, ``,``, ``{`` or + ``[`` immediately before the ``"``) are considered. + * Within such a span, only a ``MEDIA:`` followed by a **bare** path + (``/``, ``~/`` or ``X:\\``) is masked. A ``MEDIA:"..."`` quoted-path + tag — a real LLM output format the extractor supports — is not bare and + is left alone. + * Tags at line start, after prose whitespace, or indented are outside any + JSON value span and are never affected. + + Offsets are preserved (matched chars replaced with spaces, newlines kept) + so downstream match positions stay valid. + """ + if '"' not in content or "MEDIA:" not in content: + return content + chars = list(content) + # JSON value-context string: a quote preceded by : , { or [ (optional ws), + # capturing the (escape-aware) string body up to the closing quote. + for m in re.finditer(r'(?<=[:,{\[])\s*"((?:[^"\\\n]|\\.)*)"', content): + seg = m.group(1) + if re.search(r'MEDIA:\s*(?:~/|/|[A-Za-z]:[/\\])', seg): + for i in range(m.start(1), m.end(1)): + if chars[i] != '\n': + chars[i] = ' ' + return ''.join(chars) + @staticmethod def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]: """ @@ -2621,7 +2661,10 @@ class BasePlatformAdapter(ABC): # set is the shared MEDIA_DELIVERY_EXTS source of truth (built once into # MEDIA_TAG_CLEANUP_RE) so it can never drift from extract_local_files. media_pattern = MEDIA_TAG_CLEANUP_RE - for match in media_pattern.finditer(content): + # Mask MEDIA: embedded inside serialized JSON string values so stale + # paths from stored tool results are never re-delivered (#34375). + scan_content = BasePlatformAdapter._mask_json_string_media(content) + for match in media_pattern.finditer(scan_content): path = match.group("path").strip() if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'": path = path[1:-1].strip() @@ -2636,7 +2679,9 @@ class BasePlatformAdapter(ABC): # Remove MEDIA tags from content (including surrounding quote/backtick wrappers) if media: - cleaned = media_pattern.sub('', cleaned) + # Mask JSON-embedded tags before sub so they stay intact in the + # user-visible text (they are stored data, not directives). + cleaned = media_pattern.sub('', BasePlatformAdapter._mask_json_string_media(cleaned)) cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip() return media, cleaned diff --git a/tests/gateway/test_platform_base.py b/tests/gateway/test_platform_base.py index c38070317..4fe61adf2 100644 --- a/tests/gateway/test_platform_base.py +++ b/tests/gateway/test_platform_base.py @@ -401,6 +401,73 @@ class TestExtractMedia: assert media == [] +class TestMediaInsideSerializedJson: + """Regression coverage for #34375 — MEDIA: embedded in serialized JSON + string values (e.g. a stored previous reply inside a tool result) must not + be re-delivered as a real attachment, while legitimate MEDIA: tags in prose, + at line start, indented, or as quoted-path tags keep working. + """ + + def test_media_in_json_value_not_extracted(self): + content = '{"result": "MEDIA:/tmp/stale.png"}' + media, _ = BasePlatformAdapter.extract_media(content) + assert media == [], f"JSON value MEDIA: leaked: {media}" + + def test_media_in_pretty_json_value_not_extracted(self): + content = '{\n "tool_result": "MEDIA:/var/old.jpg"\n}' + media, _ = BasePlatformAdapter.extract_media(content) + assert media == [], f"pretty JSON MEDIA: leaked: {media}" + + def test_media_in_json_array_not_extracted(self): + content = '["MEDIA:/a/b.png", "other"]' + media, _ = BasePlatformAdapter.extract_media(content) + assert media == [], f"JSON array MEDIA: leaked: {media}" + + def test_media_in_nested_json_value_not_extracted(self): + content = '{"a":{"b":"see MEDIA:/x/y.pdf here"}}' + media, _ = BasePlatformAdapter.extract_media(content) + assert media == [], f"nested JSON MEDIA: leaked: {media}" + + def test_media_in_embedded_serialized_reply_not_extracted(self): + """A serialized tool result that embeds a prior reply's MEDIA: tag.""" + content = ( + '{"content":"previous reply MEDIA:/Users/ex/.hermes/media/' + 'generated/stale.png and more text"}' + ) + media, _ = BasePlatformAdapter.extract_media(content) + assert media == [], f"embedded serialized reply leaked: {media}" + + # --- Legitimate tags must still extract (no regression vs line-start anchor) --- + + def test_media_at_line_start_still_extracted(self): + media, _ = BasePlatformAdapter.extract_media("MEDIA:/real/file.png") + assert len(media) == 1 and media[0][0] == "/real/file.png" + + def test_media_after_prose_same_line_still_extracted(self): + media, _ = BasePlatformAdapter.extract_media( + "Here is your file: MEDIA:/out/report.pdf" + ) + assert len(media) == 1 and media[0][0] == "/out/report.pdf" + + def test_media_indented_still_extracted(self): + media, _ = BasePlatformAdapter.extract_media(" MEDIA:/tmp/x.png") + assert len(media) == 1 and media[0][0] == "/tmp/x.png" + + def test_quoted_path_media_still_extracted(self): + """MEDIA:"..." quoted-path form (a real LLM output) is not JSON-masked.""" + media, _ = BasePlatformAdapter.extract_media( + 'MEDIA:"/path/with space/file.png"' + ) + assert len(media) == 1 and media[0][0] == "/path/with space/file.png" + + def test_tts_two_line_still_extracted(self): + media, _ = BasePlatformAdapter.extract_media( + "[[audio_as_voice]]\nMEDIA:/tmp/v.ogg" + ) + assert len(media) == 1 and media[0][0] == "/tmp/v.ogg" + assert media[0][1] is True # voice flag + + class TestMediaExtensionAllowlistParity: """Regression coverage for issue #34517 — the MEDIA: extension black hole.