fix(gateway): unify MEDIA: extraction extension set + close the unknown-ext black hole (#34517) (#34844)

MEDIA:<path> tags for .md/.json/.yaml/.xml/.html and other document
extensions were silently dropped. extract_media() carried a narrow
extension allowlist that omitted them, while extract_local_files()
had a broad one. The dispatch sites then ran an unconditional
re.sub(r'MEDIA:\\s*\\S+', '') that stripped the tag from the body even
when extract_media had not matched it — so extract_local_files (broad
list) ran on text where the path was already gone, and the file was
delivered by neither path.

- Add MEDIA_DELIVERY_EXTS in gateway/platforms/base.py as the single
  source of truth; extract_media and extract_local_files both derive
  their extension set from it (no more drift).
- Replace the loose MEDIA cleanup at the non-streaming dispatch site
  (base.py) and the streaming consumer (stream_consumer.py) with the
  shared, extension-anchored MEDIA_TAG_CLEANUP_RE. A MEDIA: tag with an
  unknown extension is left in the body so the bare-path detector can
  still pick it up instead of being black-holed.
- Chain cleaned text through extract_media -> extract_images ->
  extract_local_files in run.py's post-stream media delivery (it was
  dropping the cleaned text and rescanning raw text with MEDIA: tags).
- Regression tests covering both halves: previously-dropped extensions
  now extract, and unknown-ext paths survive the cleanup.

Consolidates the MEDIA extension-allowlist PR cluster.

Co-authored-by: Bartok9 <259807879+Bartok9@users.noreply.github.com>
Co-authored-by: banditburai <123342691+banditburai@users.noreply.github.com>
Co-authored-by: Kyzcreig <9063726+Kyzcreig@users.noreply.github.com>
This commit is contained in:
Teknium
2026-05-29 13:24:01 -07:00
committed by GitHub
parent 0dc0c5ea6b
commit 781604ce4c
5 changed files with 147 additions and 29 deletions

View File

@ -1131,6 +1131,75 @@ SUPPORTED_IMAGE_DOCUMENT_TYPES = {
}
# ---------------------------------------------------------------------------
# Media-delivery extension allowlist — SINGLE SOURCE OF TRUTH
#
# Both extractors that turn response text into native attachments derive their
# extension set from this tuple:
# * ``extract_media()`` — explicit ``MEDIA:<path>`` tags
# * ``extract_local_files()`` — bare absolute/home paths the agent mentions
#
# Historically these two carried independently-maintained extension lists.
# ``extract_media`` had a narrow list (no .md/.json/.yaml/.xml/.html/...) while
# ``extract_local_files`` had a broad one. Combined with the unconditional
# ``MEDIA:\\s*\\S+`` cleanup at the dispatch sites, that mismatch created a
# silent black hole: a ``MEDIA:/report.md`` tag failed the narrow extract_media
# match, got stripped from the body by the loose cleanup regex, and was then
# invisible to extract_local_files — the file was never delivered (issue
# #34517). Keeping one list eliminates the drift; building the cleanup regexes
# from the same set means a tag is only stripped when its extension is one we
# can actually deliver, so an unknown-extension path survives in the body
# instead of vanishing.
#
# Covers images (inline), video (inline where supported), audio (voice/audio),
# documents/spreadsheets/presentations (send_document), archives, and rendered
# web output. The dispatch partition (image vs video vs document) lives in
# ``gateway/run.py``.
# ---------------------------------------------------------------------------
MEDIA_DELIVERY_EXTS: Tuple[str, ...] = (
# Images (embed inline)
".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".svg",
# Video (embed inline where supported)
".mp4", ".mov", ".avi", ".mkv", ".webm",
# Audio (delivered as voice/audio where supported)
".mp3", ".wav", ".ogg", ".opus", ".m4a", ".flac",
# Documents (uploaded as file attachments)
".pdf", ".docx", ".doc", ".odt", ".rtf", ".txt", ".md", ".epub",
# Spreadsheets / data
".xlsx", ".xls", ".ods", ".csv", ".tsv", ".json", ".xml", ".yaml", ".yml",
# Presentations
".pptx", ".ppt", ".odp", ".key",
# Archives
".zip", ".tar", ".gz", ".tgz", ".bz2", ".xz", ".7z", ".rar", ".apk", ".ipa",
# Web / rendered output
".html", ".htm",
)
# Regex alternation fragment of bare extensions (no leading dot), e.g.
# ``png|jpe?g|...``. ``jpe?g`` collapses jpg/jpeg into one branch. Sorted
# longest-first so the alternation never matches a shorter ext as a prefix of
# a longer one (e.g. ``.tar`` before ``.tar.gz`` components).
_MEDIA_EXT_ALTERNATION = "|".join(
sorted((e.lstrip(".") for e in MEDIA_DELIVERY_EXTS), key=len, reverse=True)
)
# Anchored ``MEDIA:<path>`` cleanup pattern. Unlike the old loose
# ``MEDIA:\\s*\\S+``, this only strips a tag whose path ends in a known
# deliverable extension (optionally quoted/backticked). A ``MEDIA:`` tag with
# an unknown extension is left in the text so it can still be picked up by the
# bare-path detector (extract_local_files) downstream rather than silently
# deleted. Shared by the non-streaming dispatch path and the streaming
# consumer so both behave identically.
MEDIA_TAG_CLEANUP_RE = re.compile(
r'''[`"']?MEDIA:\s*'''
r'''(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|'''
r'''(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:''' + _MEDIA_EXT_ALTERNATION + r'''))'''
r'''(?=[\s`"',;:)\]}]|$)[`"']?''',
re.IGNORECASE,
)
def get_document_cache_dir() -> Path:
"""Return the document cache directory, creating it if it doesn't exist."""
DOCUMENT_CACHE_DIR.mkdir(parents=True, exist_ok=True)
@ -2542,10 +2611,10 @@ class BasePlatformAdapter(ABC):
cleaned = cleaned.replace("[[as_document]]", "")
# Extract MEDIA:<path> tags, allowing optional whitespace after the colon
# and quoted/backticked paths for LLM-formatted outputs.
media_pattern = re.compile(
r'''[`"']?MEDIA:\s*(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:png|jpe?g|gif|webp|mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|txt|csv|apk|ipa)(?=[\s`"',;:)\]}]|$))[`"']?'''
)
# and quoted/backticked paths for LLM-formatted outputs. The extension
# set is the shared MEDIA_DELIVERY_EXTS source of truth (built once into
# MEDIA_TAG_CLEANUP_RE) so it can never drift from extract_local_files.
media_pattern = MEDIA_TAG_CLEANUP_RE
for match in media_pattern.finditer(content):
path = match.group("path").strip()
if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'":
@ -2591,24 +2660,7 @@ class BasePlatformAdapter(ABC):
Tuple of (list of expanded file paths, cleaned text with the
raw path strings removed).
"""
_LOCAL_MEDIA_EXTS = (
# Images (embed inline)
'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff', '.svg',
# Video (embed inline where supported)
'.mp4', '.mov', '.avi', '.mkv', '.webm',
# Audio (delivered as voice/audio where supported)
'.mp3', '.wav', '.ogg', '.m4a', '.flac',
# Documents (uploaded as file attachments)
'.pdf', '.docx', '.doc', '.odt', '.rtf', '.txt', '.md',
# Spreadsheets / data
'.xlsx', '.xls', '.ods', '.csv', '.tsv', '.json', '.xml', '.yaml', '.yml',
# Presentations
'.pptx', '.ppt', '.odp', '.key',
# Archives
'.zip', '.tar', '.gz', '.tgz', '.bz2', '.xz', '.7z', '.rar',
# Web / rendered output
'.html', '.htm',
)
_LOCAL_MEDIA_EXTS = MEDIA_DELIVERY_EXTS
ext_part = '|'.join(e.lstrip('.') for e in _LOCAL_MEDIA_EXTS)
# (?<![/:\w.]) prevents matching inside URLs (e.g. https://…/img.png)
@ -3729,7 +3781,12 @@ class BasePlatformAdapter(ABC):
# Strip any remaining internal directives from message body (fixes #1561)
text_content = text_content.replace("[[audio_as_voice]]", "").strip()
text_content = text_content.replace("[[as_document]]", "").strip()
text_content = re.sub(r"MEDIA:\s*\S+", "", text_content).strip()
# Strip only MEDIA: tags whose path has a deliverable extension
# (shared MEDIA_TAG_CLEANUP_RE). A MEDIA: tag with an unknown
# extension is intentionally left in the body so extract_local_files
# below can still pick up the bare path — otherwise the file would
# be silently dropped (issue #34517).
text_content = MEDIA_TAG_CLEANUP_RE.sub("", text_content).strip()
if images:
logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response))

View File

@ -11743,9 +11743,16 @@ class GatewayRunner:
from gateway.platforms.base import BasePlatformAdapter, should_send_media_as_audio
media_files, _ = adapter.extract_media(response)
media_files, cleaned = adapter.extract_media(response)
media_files = BasePlatformAdapter.filter_media_delivery_paths(media_files)
_, cleaned = adapter.extract_images(response)
# Chain the cleaned text through each extractor (extract_media →
# extract_images → extract_local_files) so MEDIA: tags and image URLs
# are removed before the bare-path auto-detect runs. Previously the
# cleaned text from extract_media was dropped (``_``) and
# extract_local_files scanned text that still contained MEDIA: tags,
# producing false-positive bare-path matches with the MEDIA: prefix
# glued on. This matches the chain order in gateway/platforms/base.py.
_, cleaned = adapter.extract_images(cleaned)
local_files, _ = adapter.extract_local_files(cleaned)
local_files = BasePlatformAdapter.filter_local_delivery_paths(local_files)

View File

@ -26,6 +26,7 @@ from typing import Any, Callable, Optional
from gateway.platforms.base import BasePlatformAdapter as _BasePlatformAdapter
from gateway.platforms.base import _custom_unit_to_cp
from gateway.platforms.base import MEDIA_TAG_CLEANUP_RE
from gateway.config import (
DEFAULT_STREAMING_EDIT_INTERVAL as _DEFAULT_STREAMING_EDIT_INTERVAL,
DEFAULT_STREAMING_BUFFER_THRESHOLD as _DEFAULT_STREAMING_BUFFER_THRESHOLD,
@ -645,10 +646,13 @@ class GatewayStreamConsumer:
except Exception as e:
logger.error("Stream consumer error: %s", e)
# Pattern to strip MEDIA:<path> tags (including optional surrounding quotes).
# Matches the simple cleanup regex used by the non-streaming path in
# gateway/platforms/base.py for post-processing.
_MEDIA_RE = re.compile(r'''[`"']?MEDIA:\s*\S+[`"']?''')
# Strip MEDIA:<path> tags before display. Uses the shared anchored
# MEDIA_TAG_CLEANUP_RE from gateway/platforms/base.py — only tags whose
# path ends in a deliverable extension are removed, so an unknown-extension
# path stays visible instead of being silently dropped (issue #34517).
# Streaming and non-streaming paths share the same regex, so a tag is
# treated identically whichever path delivered the text.
_MEDIA_RE = MEDIA_TAG_CLEANUP_RE
@staticmethod
def _clean_for_display(text: str) -> str:

View File

@ -525,6 +525,8 @@ AUTHOR_MAP = {
"barnacleboy.jezzahehn@agentmail.to": "JezzaHehn",
"254021826+dodo-reach@users.noreply.github.com": "dodo-reach",
"259807879+Bartok9@users.noreply.github.com": "Bartok9",
"123342691+banditburai@users.noreply.github.com": "banditburai",
"9063726+Kyzcreig@users.noreply.github.com": "Kyzcreig",
"270082434+crayfish-ai@users.noreply.github.com": "crayfish-ai",
"241404605+MestreY0d4-Uninter@users.noreply.github.com": "MestreY0d4-Uninter",
"268667990+Roy-oss1@users.noreply.github.com": "Roy-oss1",

View File

@ -362,6 +362,54 @@ class TestExtractMedia:
assert "[[as_document]]" not in cleaned
class TestMediaExtensionAllowlistParity:
"""Regression coverage for issue #34517 — the MEDIA: extension black hole.
extract_media used to carry a narrow extension allowlist that omitted
.md/.json/.yaml/.xml/.html etc., while extract_local_files had a broad one.
Combined with an unconditional ``MEDIA:\\s*\\S+`` strip at the dispatch
sites, an unmatched MEDIA: tag for one of those extensions was deleted from
the body before extract_local_files could pick up the bare path — the file
was silently dropped. Both extractors now derive from the single
MEDIA_DELIVERY_EXTS source of truth, and the strip is anchored to that set.
"""
DROPPED_BEFORE = ["md", "json", "yaml", "yml", "xml", "html", "htm",
"tsv", "svg"]
def test_previously_dropped_extensions_now_extract(self):
for ext in self.DROPPED_BEFORE:
path = f"/tmp/report.{ext}"
media, _ = BasePlatformAdapter.extract_media(f"Here: MEDIA:{path}")
assert media == [(path, False)], f".{ext} should extract via MEDIA:"
def test_extract_media_and_local_files_share_one_extension_set(self):
from gateway.platforms.base import MEDIA_DELIVERY_EXTS
# Both functions reference MEDIA_DELIVERY_EXTS; assert the documents
# that motivated the bug are present in the shared set.
for ext in (".md", ".json", ".yaml", ".yml", ".xml", ".html", ".htm"):
assert ext in MEDIA_DELIVERY_EXTS
def test_unknown_extension_not_black_holed_by_cleanup(self):
"""A MEDIA: tag with an unknown extension is NOT stripped from the
body — it survives so extract_local_files can still see the bare path,
rather than vanishing entirely (the core of issue #34517)."""
from gateway.platforms.base import MEDIA_TAG_CLEANUP_RE
text = "Saved to MEDIA:/tmp/data.weirdext done"
media, _ = BasePlatformAdapter.extract_media(text)
assert media == [] # unknown extension is not a deliverable MEDIA tag
stripped = MEDIA_TAG_CLEANUP_RE.sub("", text)
assert "/tmp/data.weirdext" in stripped # path preserved, not dropped
def test_known_extension_tag_is_stripped_from_body(self):
from gateway.platforms.base import MEDIA_TAG_CLEANUP_RE
text = "Here is your report: MEDIA:/tmp/report.md"
stripped = MEDIA_TAG_CLEANUP_RE.sub("", text).strip()
assert "MEDIA:" not in stripped
assert "/tmp/report.md" not in stripped
assert "Here is your report:" in stripped
class TestMediaDeliveryPathValidation:
def _patch_roots(self, monkeypatch, *roots):
monkeypatch.setattr(