From 781604ce4c826ec06b69a0bde703ab935308893d Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 13:24:01 -0700 Subject: [PATCH] fix(gateway): unify MEDIA: extraction extension set + close the unknown-ext black hole (#34517) (#34844) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MEDIA: tags for .md/.json/.yaml/.xml/.html and other document extensions were silently dropped. extract_media() carried a narrow extension allowlist that omitted them, while extract_local_files() had a broad one. The dispatch sites then ran an unconditional re.sub(r'MEDIA:\\s*\\S+', '') that stripped the tag from the body even when extract_media had not matched it — so extract_local_files (broad list) ran on text where the path was already gone, and the file was delivered by neither path. - Add MEDIA_DELIVERY_EXTS in gateway/platforms/base.py as the single source of truth; extract_media and extract_local_files both derive their extension set from it (no more drift). - Replace the loose MEDIA cleanup at the non-streaming dispatch site (base.py) and the streaming consumer (stream_consumer.py) with the shared, extension-anchored MEDIA_TAG_CLEANUP_RE. A MEDIA: tag with an unknown extension is left in the body so the bare-path detector can still pick it up instead of being black-holed. - Chain cleaned text through extract_media -> extract_images -> extract_local_files in run.py's post-stream media delivery (it was dropping the cleaned text and rescanning raw text with MEDIA: tags). - Regression tests covering both halves: previously-dropped extensions now extract, and unknown-ext paths survive the cleanup. Consolidates the MEDIA extension-allowlist PR cluster. Co-authored-by: Bartok9 <259807879+Bartok9@users.noreply.github.com> Co-authored-by: banditburai <123342691+banditburai@users.noreply.github.com> Co-authored-by: Kyzcreig <9063726+Kyzcreig@users.noreply.github.com> --- gateway/platforms/base.py | 103 +++++++++++++++++++++------- gateway/run.py | 11 ++- gateway/stream_consumer.py | 12 ++-- scripts/release.py | 2 + tests/gateway/test_platform_base.py | 48 +++++++++++++ 5 files changed, 147 insertions(+), 29 deletions(-) diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 0d141d0fc..6979a8691 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -1131,6 +1131,75 @@ SUPPORTED_IMAGE_DOCUMENT_TYPES = { } +# --------------------------------------------------------------------------- +# Media-delivery extension allowlist — SINGLE SOURCE OF TRUTH +# +# Both extractors that turn response text into native attachments derive their +# extension set from this tuple: +# * ``extract_media()`` — explicit ``MEDIA:`` tags +# * ``extract_local_files()`` — bare absolute/home paths the agent mentions +# +# Historically these two carried independently-maintained extension lists. +# ``extract_media`` had a narrow list (no .md/.json/.yaml/.xml/.html/...) while +# ``extract_local_files`` had a broad one. Combined with the unconditional +# ``MEDIA:\\s*\\S+`` cleanup at the dispatch sites, that mismatch created a +# silent black hole: a ``MEDIA:/report.md`` tag failed the narrow extract_media +# match, got stripped from the body by the loose cleanup regex, and was then +# invisible to extract_local_files — the file was never delivered (issue +# #34517). Keeping one list eliminates the drift; building the cleanup regexes +# from the same set means a tag is only stripped when its extension is one we +# can actually deliver, so an unknown-extension path survives in the body +# instead of vanishing. +# +# Covers images (inline), video (inline where supported), audio (voice/audio), +# documents/spreadsheets/presentations (send_document), archives, and rendered +# web output. The dispatch partition (image vs video vs document) lives in +# ``gateway/run.py``. +# --------------------------------------------------------------------------- + +MEDIA_DELIVERY_EXTS: Tuple[str, ...] = ( + # Images (embed inline) + ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".svg", + # Video (embed inline where supported) + ".mp4", ".mov", ".avi", ".mkv", ".webm", + # Audio (delivered as voice/audio where supported) + ".mp3", ".wav", ".ogg", ".opus", ".m4a", ".flac", + # Documents (uploaded as file attachments) + ".pdf", ".docx", ".doc", ".odt", ".rtf", ".txt", ".md", ".epub", + # Spreadsheets / data + ".xlsx", ".xls", ".ods", ".csv", ".tsv", ".json", ".xml", ".yaml", ".yml", + # Presentations + ".pptx", ".ppt", ".odp", ".key", + # Archives + ".zip", ".tar", ".gz", ".tgz", ".bz2", ".xz", ".7z", ".rar", ".apk", ".ipa", + # Web / rendered output + ".html", ".htm", +) + +# Regex alternation fragment of bare extensions (no leading dot), e.g. +# ``png|jpe?g|...``. ``jpe?g`` collapses jpg/jpeg into one branch. Sorted +# longest-first so the alternation never matches a shorter ext as a prefix of +# a longer one (e.g. ``.tar`` before ``.tar.gz`` components). +_MEDIA_EXT_ALTERNATION = "|".join( + sorted((e.lstrip(".") for e in MEDIA_DELIVERY_EXTS), key=len, reverse=True) +) + +# Anchored ``MEDIA:`` cleanup pattern. Unlike the old loose +# ``MEDIA:\\s*\\S+``, this only strips a tag whose path ends in a known +# deliverable extension (optionally quoted/backticked). A ``MEDIA:`` tag with +# an unknown extension is left in the text so it can still be picked up by the +# bare-path detector (extract_local_files) downstream rather than silently +# deleted. Shared by the non-streaming dispatch path and the streaming +# consumer so both behave identically. +MEDIA_TAG_CLEANUP_RE = re.compile( + r'''[`"']?MEDIA:\s*''' + r'''(?P`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|''' + r'''(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:''' + _MEDIA_EXT_ALTERNATION + r'''))''' + r'''(?=[\s`"',;:)\]}]|$)[`"']?''', + re.IGNORECASE, +) + + def get_document_cache_dir() -> Path: """Return the document cache directory, creating it if it doesn't exist.""" DOCUMENT_CACHE_DIR.mkdir(parents=True, exist_ok=True) @@ -2542,10 +2611,10 @@ class BasePlatformAdapter(ABC): cleaned = cleaned.replace("[[as_document]]", "") # Extract MEDIA: tags, allowing optional whitespace after the colon - # and quoted/backticked paths for LLM-formatted outputs. - media_pattern = re.compile( - r'''[`"']?MEDIA:\s*(?P`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:png|jpe?g|gif|webp|mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|txt|csv|apk|ipa)(?=[\s`"',;:)\]}]|$))[`"']?''' - ) + # and quoted/backticked paths for LLM-formatted outputs. The extension + # set is the shared MEDIA_DELIVERY_EXTS source of truth (built once into + # MEDIA_TAG_CLEANUP_RE) so it can never drift from extract_local_files. + media_pattern = MEDIA_TAG_CLEANUP_RE for match in media_pattern.finditer(content): path = match.group("path").strip() if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'": @@ -2591,24 +2660,7 @@ class BasePlatformAdapter(ABC): Tuple of (list of expanded file paths, cleaned text with the raw path strings removed). """ - _LOCAL_MEDIA_EXTS = ( - # Images (embed inline) - '.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff', '.svg', - # Video (embed inline where supported) - '.mp4', '.mov', '.avi', '.mkv', '.webm', - # Audio (delivered as voice/audio where supported) - '.mp3', '.wav', '.ogg', '.m4a', '.flac', - # Documents (uploaded as file attachments) - '.pdf', '.docx', '.doc', '.odt', '.rtf', '.txt', '.md', - # Spreadsheets / data - '.xlsx', '.xls', '.ods', '.csv', '.tsv', '.json', '.xml', '.yaml', '.yml', - # Presentations - '.pptx', '.ppt', '.odp', '.key', - # Archives - '.zip', '.tar', '.gz', '.tgz', '.bz2', '.xz', '.7z', '.rar', - # Web / rendered output - '.html', '.htm', - ) + _LOCAL_MEDIA_EXTS = MEDIA_DELIVERY_EXTS ext_part = '|'.join(e.lstrip('.') for e in _LOCAL_MEDIA_EXTS) # (? tags (including optional surrounding quotes). - # Matches the simple cleanup regex used by the non-streaming path in - # gateway/platforms/base.py for post-processing. - _MEDIA_RE = re.compile(r'''[`"']?MEDIA:\s*\S+[`"']?''') + # Strip MEDIA: tags before display. Uses the shared anchored + # MEDIA_TAG_CLEANUP_RE from gateway/platforms/base.py — only tags whose + # path ends in a deliverable extension are removed, so an unknown-extension + # path stays visible instead of being silently dropped (issue #34517). + # Streaming and non-streaming paths share the same regex, so a tag is + # treated identically whichever path delivered the text. + _MEDIA_RE = MEDIA_TAG_CLEANUP_RE @staticmethod def _clean_for_display(text: str) -> str: diff --git a/scripts/release.py b/scripts/release.py index 7723eefdf..e9c11404d 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -525,6 +525,8 @@ AUTHOR_MAP = { "barnacleboy.jezzahehn@agentmail.to": "JezzaHehn", "254021826+dodo-reach@users.noreply.github.com": "dodo-reach", "259807879+Bartok9@users.noreply.github.com": "Bartok9", + "123342691+banditburai@users.noreply.github.com": "banditburai", + "9063726+Kyzcreig@users.noreply.github.com": "Kyzcreig", "270082434+crayfish-ai@users.noreply.github.com": "crayfish-ai", "241404605+MestreY0d4-Uninter@users.noreply.github.com": "MestreY0d4-Uninter", "268667990+Roy-oss1@users.noreply.github.com": "Roy-oss1", diff --git a/tests/gateway/test_platform_base.py b/tests/gateway/test_platform_base.py index 6a5b8c15c..e0f2c80cb 100644 --- a/tests/gateway/test_platform_base.py +++ b/tests/gateway/test_platform_base.py @@ -362,6 +362,54 @@ class TestExtractMedia: assert "[[as_document]]" not in cleaned +class TestMediaExtensionAllowlistParity: + """Regression coverage for issue #34517 — the MEDIA: extension black hole. + + extract_media used to carry a narrow extension allowlist that omitted + .md/.json/.yaml/.xml/.html etc., while extract_local_files had a broad one. + Combined with an unconditional ``MEDIA:\\s*\\S+`` strip at the dispatch + sites, an unmatched MEDIA: tag for one of those extensions was deleted from + the body before extract_local_files could pick up the bare path — the file + was silently dropped. Both extractors now derive from the single + MEDIA_DELIVERY_EXTS source of truth, and the strip is anchored to that set. + """ + + DROPPED_BEFORE = ["md", "json", "yaml", "yml", "xml", "html", "htm", + "tsv", "svg"] + + def test_previously_dropped_extensions_now_extract(self): + for ext in self.DROPPED_BEFORE: + path = f"/tmp/report.{ext}" + media, _ = BasePlatformAdapter.extract_media(f"Here: MEDIA:{path}") + assert media == [(path, False)], f".{ext} should extract via MEDIA:" + + def test_extract_media_and_local_files_share_one_extension_set(self): + from gateway.platforms.base import MEDIA_DELIVERY_EXTS + # Both functions reference MEDIA_DELIVERY_EXTS; assert the documents + # that motivated the bug are present in the shared set. + for ext in (".md", ".json", ".yaml", ".yml", ".xml", ".html", ".htm"): + assert ext in MEDIA_DELIVERY_EXTS + + def test_unknown_extension_not_black_holed_by_cleanup(self): + """A MEDIA: tag with an unknown extension is NOT stripped from the + body — it survives so extract_local_files can still see the bare path, + rather than vanishing entirely (the core of issue #34517).""" + from gateway.platforms.base import MEDIA_TAG_CLEANUP_RE + text = "Saved to MEDIA:/tmp/data.weirdext done" + media, _ = BasePlatformAdapter.extract_media(text) + assert media == [] # unknown extension is not a deliverable MEDIA tag + stripped = MEDIA_TAG_CLEANUP_RE.sub("", text) + assert "/tmp/data.weirdext" in stripped # path preserved, not dropped + + def test_known_extension_tag_is_stripped_from_body(self): + from gateway.platforms.base import MEDIA_TAG_CLEANUP_RE + text = "Here is your report: MEDIA:/tmp/report.md" + stripped = MEDIA_TAG_CLEANUP_RE.sub("", text).strip() + assert "MEDIA:" not in stripped + assert "/tmp/report.md" not in stripped + assert "Here is your report:" in stripped + + class TestMediaDeliveryPathValidation: def _patch_roots(self, monkeypatch, *roots): monkeypatch.setattr(