From e8827ef704ac0ca239f7cca4cf170f9d1f08e58e Mon Sep 17 00:00:00 2001
From: liuhao1024 <sunsky.lau@gmail.com>
Date: Mon, 1 Jun 2026 11:39:40 +0530
Subject: [PATCH] fix(gateway): skip MEDIA: inside serialized JSON string
 values

Serialized tool results frequently embed a prior reply's text, e.g.
{"result": "MEDIA:/path/stale.png"}. The bare-path branch of
MEDIA_TAG_CLEANUP_RE matched these and re-delivered stale files (#34375).

Adds BasePlatformAdapter._mask_json_string_media, which blanks (offset-
preserving) only MEDIA:<bare-path> tokens that sit inside a JSON value-
context string (opened by : , { or [). Legitimate tags at line start,
after prose, indented, MEDIA:"quoted" form, and two-line TTS output are
all left untouched.

Reworked from the approach in #34388 (a line-start regex anchor), which
no longer applied to current main and regressed same-line/indented tags.

Co-authored-by: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com>
---
 gateway/platforms/base.py           | 49 ++++++++++++++++++++-
 tests/gateway/test_platform_base.py | 67 +++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+), 2 deletions(-)
diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py
index baa25b602..e75400085 100644
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@@ -2579,6 +2579,46 @@ class BasePlatformAdapter(ABC):
                 logger.warning("Skipping unsafe local file path: %s", _log_safe_path(raw))
         return safe_paths
 
+    @staticmethod
+    def _mask_json_string_media(content: str) -> str:
+        """Blank out ``MEDIA:<bare-path>`` occurrences that sit inside a JSON
+        string *value* so they are never delivered as real attachments.
+
+        Serialized tool results frequently embed a previous reply's text, e.g.::
+
+            {"result": "MEDIA:/Users/x/.hermes/media/generated/stale.png"}
+
+        Here the ``MEDIA:`` is part of stored text, not an outbound directive,
+        but the bare-path branch of ``MEDIA_TAG_CLEANUP_RE`` would still match it
+        and re-deliver a stale file. (Regression report #34375.)
+
+        The discriminator is precise so legitimate tags are untouched:
+
+        * Only spans opened by a JSON value-context quote (``:``, ``,``, ``{`` or
+          ``[`` immediately before the ``"``) are considered.
+        * Within such a span, only a ``MEDIA:`` followed by a **bare** path
+          (``/``, ``~/`` or ``X:\\``) is masked. A ``MEDIA:"..."`` quoted-path
+          tag — a real LLM output format the extractor supports — is not bare and
+          is left alone.
+        * Tags at line start, after prose whitespace, or indented are outside any
+          JSON value span and are never affected.
+
+        Offsets are preserved (matched chars replaced with spaces, newlines kept)
+        so downstream match positions stay valid.
+        """
+        if '"' not in content or "MEDIA:" not in content:
+            return content
+        chars = list(content)
+        # JSON value-context string: a quote preceded by : , { or [ (optional ws),
+        # capturing the (escape-aware) string body up to the closing quote.
+        for m in re.finditer(r'(?<=[:,{\[])\s*"((?:[^"\\\n]|\\.)*)"', content):
+            seg = m.group(1)
+            if re.search(r'MEDIA:\s*(?:~/|/|[A-Za-z]:[/\\])', seg):
+                for i in range(m.start(1), m.end(1)):
+                    if chars[i] != '\n':
+                        chars[i] = ' '
+        return ''.join(chars)
+
     @staticmethod
     def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
         """
@@ -2621,7 +2661,10 @@ class BasePlatformAdapter(ABC):
         # set is the shared MEDIA_DELIVERY_EXTS source of truth (built once into
         # MEDIA_TAG_CLEANUP_RE) so it can never drift from extract_local_files.
         media_pattern = MEDIA_TAG_CLEANUP_RE
-        for match in media_pattern.finditer(content):
+        # Mask MEDIA: embedded inside serialized JSON string values so stale
+        # paths from stored tool results are never re-delivered (#34375).
+        scan_content = BasePlatformAdapter._mask_json_string_media(content)
+        for match in media_pattern.finditer(scan_content):
             path = match.group("path").strip()
             if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'":
                 path = path[1:-1].strip()
@@ -2636,7 +2679,9 @@ class BasePlatformAdapter(ABC):
 
         # Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
         if media:
-            cleaned = media_pattern.sub('', cleaned)
+            # Mask JSON-embedded tags before sub so they stay intact in the
+            # user-visible text (they are stored data, not directives).
+            cleaned = media_pattern.sub('', BasePlatformAdapter._mask_json_string_media(cleaned))
             cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
         
         return media, cleaned
diff --git a/tests/gateway/test_platform_base.py b/tests/gateway/test_platform_base.py
index c38070317..4fe61adf2 100644
--- a/tests/gateway/test_platform_base.py
+++ b/tests/gateway/test_platform_base.py
@@ -401,6 +401,73 @@ class TestExtractMedia:
         assert media == []
 
 
+class TestMediaInsideSerializedJson:
+    """Regression coverage for #34375 — MEDIA: embedded in serialized JSON
+    string values (e.g. a stored previous reply inside a tool result) must not
+    be re-delivered as a real attachment, while legitimate MEDIA: tags in prose,
+    at line start, indented, or as quoted-path tags keep working.
+    """
+
+    def test_media_in_json_value_not_extracted(self):
+        content = '{"result": "MEDIA:/tmp/stale.png"}'
+        media, _ = BasePlatformAdapter.extract_media(content)
+        assert media == [], f"JSON value MEDIA: leaked: {media}"
+
+    def test_media_in_pretty_json_value_not_extracted(self):
+        content = '{\n  "tool_result": "MEDIA:/var/old.jpg"\n}'
+        media, _ = BasePlatformAdapter.extract_media(content)
+        assert media == [], f"pretty JSON MEDIA: leaked: {media}"
+
+    def test_media_in_json_array_not_extracted(self):
+        content = '["MEDIA:/a/b.png", "other"]'
+        media, _ = BasePlatformAdapter.extract_media(content)
+        assert media == [], f"JSON array MEDIA: leaked: {media}"
+
+    def test_media_in_nested_json_value_not_extracted(self):
+        content = '{"a":{"b":"see MEDIA:/x/y.pdf here"}}'
+        media, _ = BasePlatformAdapter.extract_media(content)
+        assert media == [], f"nested JSON MEDIA: leaked: {media}"
+
+    def test_media_in_embedded_serialized_reply_not_extracted(self):
+        """A serialized tool result that embeds a prior reply's MEDIA: tag."""
+        content = (
+            '{"content":"previous reply MEDIA:/Users/ex/.hermes/media/'
+            'generated/stale.png and more text"}'
+        )
+        media, _ = BasePlatformAdapter.extract_media(content)
+        assert media == [], f"embedded serialized reply leaked: {media}"
+
+    # --- Legitimate tags must still extract (no regression vs line-start anchor) ---
+
+    def test_media_at_line_start_still_extracted(self):
+        media, _ = BasePlatformAdapter.extract_media("MEDIA:/real/file.png")
+        assert len(media) == 1 and media[0][0] == "/real/file.png"
+
+    def test_media_after_prose_same_line_still_extracted(self):
+        media, _ = BasePlatformAdapter.extract_media(
+            "Here is your file: MEDIA:/out/report.pdf"
+        )
+        assert len(media) == 1 and media[0][0] == "/out/report.pdf"
+
+    def test_media_indented_still_extracted(self):
+        media, _ = BasePlatformAdapter.extract_media("  MEDIA:/tmp/x.png")
+        assert len(media) == 1 and media[0][0] == "/tmp/x.png"
+
+    def test_quoted_path_media_still_extracted(self):
+        """MEDIA:"..." quoted-path form (a real LLM output) is not JSON-masked."""
+        media, _ = BasePlatformAdapter.extract_media(
+            'MEDIA:"/path/with space/file.png"'
+        )
+        assert len(media) == 1 and media[0][0] == "/path/with space/file.png"
+
+    def test_tts_two_line_still_extracted(self):
+        media, _ = BasePlatformAdapter.extract_media(
+            "[[audio_as_voice]]\nMEDIA:/tmp/v.ogg"
+        )
+        assert len(media) == 1 and media[0][0] == "/tmp/v.ogg"
+        assert media[0][1] is True  # voice flag
+
+
 class TestMediaExtensionAllowlistParity:
     """Regression coverage for issue #34517 — the MEDIA: extension black hole.