fix(gateway): skip MEDIA: inside serialized JSON string values

Serialized tool results frequently embed a prior reply's text, e.g. {"result": "MEDIA:/path/stale.png"}. The bare-path branch of MEDIA_TAG_CLEANUP_RE matched these and re-delivered stale files (#34375). Adds BasePlatformAdapter._mask_json_string_media, which blanks (offset-preserving) only MEDIA:<bare-path> tokens that sit inside a JSON value-context string (one whose opening quote follows ":", ",", "{" or "["). Legitimate tags at line start, after prose, indented, MEDIA:"quoted" form, and two-line TTS output are all left untouched. Reworked from the approach in #34388 (a line-start regex anchor), which no longer applied to current main and regressed same-line/indented tags. Co-authored-by: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com>
2026-06-01 11:39:40 +05:30
parent b3aaf2676b
commit e8827ef704
2 changed files with 114 additions and 2 deletions
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@ -2579,6 +2579,46 @@ class BasePlatformAdapter(ABC):
                logger.warning("Skipping unsafe local file path: %s", _log_safe_path(raw))
        return safe_paths

+    @staticmethod
+    def _mask_json_string_media(content: str) -> str:
+        """Blank the body of any JSON-style string *value* that contains a
+        ``MEDIA:<bare-path>`` token, so the tag scan cannot match inside it.
+
+        Serialized tool results frequently embed a previous reply's text, e.g.::
+
+            {"result": "MEDIA:/Users/x/.hermes/media/generated/stale.png"}
+
+        Here the ``MEDIA:`` is part of stored text, not an outbound directive,
+        but the bare-path branch of ``MEDIA_TAG_CLEANUP_RE`` would still match it
+        and re-deliver a stale file. (Regression report #34375.)
+
+        How spans are selected and masked:
+
+        * Only strings whose opening ``"`` follows ``:``, ``,``, ``{`` or ``[``
+          (optional whitespace between) and close on the same line are considered.
+        * A span is masked only if its body has ``MEDIA:`` followed by a **bare**
+          path (``/``, ``~/`` or ``X:\\``); then the *whole* body between the
+          quotes is blanked, not just the tag. ``MEDIA:"..."`` is not bare, so
+          it does not trigger masking.
+        * The test is lexical: prose such as ``note: "MEDIA:/x.png"`` is masked
+          too, while tags not inside such a quoted span are never affected.
+
+        Offsets are preserved (matched chars replaced with spaces, quotes kept)
+        so downstream match positions stay valid.
+        """
+        if '"' not in content or "MEDIA:" not in content:
+            return content
+        chars = list(content)
+        # JSON value-context string: a quote preceded by : , { or [ (optional ws),
+        # capturing the (escape-aware, single-line) body up to the closing quote.
+        for m in re.finditer(r'(?<=[:,{\[])\s*"((?:[^"\\\n]|\\.)*)"', content):
+            seg = m.group(1)
+            if re.search(r'MEDIA:\s*(?:~/|/|[A-Za-z]:[/\\])', seg):
+                for i in range(m.start(1), m.end(1)):
+                    if chars[i] != '\n':
+                        chars[i] = ' '
+        return ''.join(chars)
+
    @staticmethod
    def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
        """
@ -2621,7 +2661,10 @@ class BasePlatformAdapter(ABC):
        # set is the shared MEDIA_DELIVERY_EXTS source of truth (built once into
        # MEDIA_TAG_CLEANUP_RE) so it can never drift from extract_local_files.
        media_pattern = MEDIA_TAG_CLEANUP_RE
-        for match in media_pattern.finditer(content):
+        # Mask MEDIA: embedded inside serialized JSON string values so stale
+        # paths from stored tool results are never re-delivered (#34375).
+        scan_content = BasePlatformAdapter._mask_json_string_media(content)
+        for match in media_pattern.finditer(scan_content):
            path = match.group("path").strip()
            if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'":
                path = path[1:-1].strip()
@ -2636,7 +2679,9 @@ class BasePlatformAdapter(ABC):

        # Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
        if media:
-            cleaned = media_pattern.sub('', cleaned)
+            # Mask JSON-embedded tags before sub so the pattern skips them.
+            # NOTE(review): masked JSON string bodies are returned as spaces.
+            cleaned = media_pattern.sub('', BasePlatformAdapter._mask_json_string_media(cleaned))
            cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
        
        return media, cleaned
--- a/tests/gateway/test_platform_base.py
+++ b/tests/gateway/test_platform_base.py
@ -401,6 +401,73 @@ class TestExtractMedia:
        assert media == []


+class TestMediaInsideSerializedJson:
+    """Regression coverage for #34375 — MEDIA: inside serialized JSON string
+    values (e.g. a stored prior reply in a tool result) must not be delivered,
+    while tags in prose, at line start, indented, or quoted-path still extract.
+    Only the returned media list is asserted; the cleaned text is not checked.
+    """
+
+    def test_media_in_json_value_not_extracted(self):
+        content = '{"result": "MEDIA:/tmp/stale.png"}'
+        media, _ = BasePlatformAdapter.extract_media(content)
+        assert media == [], f"JSON value MEDIA: leaked: {media}"
+
+    def test_media_in_pretty_json_value_not_extracted(self):
+        content = '{\n  "tool_result": "MEDIA:/var/old.jpg"\n}'
+        media, _ = BasePlatformAdapter.extract_media(content)
+        assert media == [], f"pretty JSON MEDIA: leaked: {media}"
+
+    def test_media_in_json_array_not_extracted(self):
+        content = '["MEDIA:/a/b.png", "other"]'
+        media, _ = BasePlatformAdapter.extract_media(content)
+        assert media == [], f"JSON array MEDIA: leaked: {media}"
+
+    def test_media_in_nested_json_value_not_extracted(self):
+        content = '{"a":{"b":"see MEDIA:/x/y.pdf here"}}'
+        media, _ = BasePlatformAdapter.extract_media(content)
+        assert media == [], f"nested JSON MEDIA: leaked: {media}"
+
+    def test_media_in_embedded_serialized_reply_not_extracted(self):
+        """A serialized tool result that embeds a prior reply's MEDIA: tag."""
+        content = (
+            '{"content":"previous reply MEDIA:/Users/ex/.hermes/media/'
+            'generated/stale.png and more text"}'
+        )
+        media, _ = BasePlatformAdapter.extract_media(content)
+        assert media == [], f"embedded serialized reply leaked: {media}"
+
+    # --- Legitimate tags must still extract (no regression vs line-start anchor) ---
+
+    def test_media_at_line_start_still_extracted(self):
+        media, _ = BasePlatformAdapter.extract_media("MEDIA:/real/file.png")
+        assert len(media) == 1 and media[0][0] == "/real/file.png"
+
+    def test_media_after_prose_same_line_still_extracted(self):
+        media, _ = BasePlatformAdapter.extract_media(
+            "Here is your file: MEDIA:/out/report.pdf"
+        )
+        assert len(media) == 1 and media[0][0] == "/out/report.pdf"
+
+    def test_media_indented_still_extracted(self):
+        media, _ = BasePlatformAdapter.extract_media("  MEDIA:/tmp/x.png")
+        assert len(media) == 1 and media[0][0] == "/tmp/x.png"
+
+    def test_quoted_path_media_still_extracted(self):
+        """MEDIA:"..." quoted-path form (a real LLM output) is not JSON-masked."""
+        media, _ = BasePlatformAdapter.extract_media(
+            'MEDIA:"/path/with space/file.png"'
+        )
+        assert len(media) == 1 and media[0][0] == "/path/with space/file.png"
+
+    def test_tts_two_line_still_extracted(self):
+        media, _ = BasePlatformAdapter.extract_media(
+            "[[audio_as_voice]]\nMEDIA:/tmp/v.ogg"
+        )
+        assert len(media) == 1 and media[0][0] == "/tmp/v.ogg"
+        assert media[0][1] is True  # voice flag
+
+
 class TestMediaExtensionAllowlistParity:
    """Regression coverage for issue #34517 — the MEDIA: extension black hole.