fix(gateway): unify MEDIA: extraction extension set + close the unknown-ext black hole (#34517) (#34844)
MEDIA:<path> tags for .md/.json/.yaml/.xml/.html and other document extensions were silently dropped. extract_media() carried a narrow extension allowlist that omitted them, while extract_local_files() had a broad one. The dispatch sites then ran an unconditional re.sub(r'MEDIA:\\s*\\S+', '') that stripped the tag from the body even when extract_media had not matched it — so extract_local_files (broad list) ran on text where the path was already gone, and the file was delivered by neither path. - Add MEDIA_DELIVERY_EXTS in gateway/platforms/base.py as the single source of truth; extract_media and extract_local_files both derive their extension set from it (no more drift). - Replace the loose MEDIA cleanup at the non-streaming dispatch site (base.py) and the streaming consumer (stream_consumer.py) with the shared, extension-anchored MEDIA_TAG_CLEANUP_RE. A MEDIA: tag with an unknown extension is left in the body so the bare-path detector can still pick it up instead of being black-holed. - Chain cleaned text through extract_media -> extract_images -> extract_local_files in run.py's post-stream media delivery (it was dropping the cleaned text and rescanning raw text with MEDIA: tags). - Regression tests covering both halves: previously-dropped extensions now extract, and unknown-ext paths survive the cleanup. Consolidates the MEDIA extension-allowlist PR cluster. Co-authored-by: Bartok9 <259807879+Bartok9@users.noreply.github.com> Co-authored-by: banditburai <123342691+banditburai@users.noreply.github.com> Co-authored-by: Kyzcreig <9063726+Kyzcreig@users.noreply.github.com>
This commit is contained in:
@ -1131,6 +1131,75 @@ SUPPORTED_IMAGE_DOCUMENT_TYPES = {
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Media-delivery extension allowlist — SINGLE SOURCE OF TRUTH
|
||||
#
|
||||
# Both extractors that turn response text into native attachments derive their
|
||||
# extension set from this tuple:
|
||||
# * ``extract_media()`` — explicit ``MEDIA:<path>`` tags
|
||||
# * ``extract_local_files()`` — bare absolute/home paths the agent mentions
|
||||
#
|
||||
# Historically these two carried independently-maintained extension lists.
|
||||
# ``extract_media`` had a narrow list (no .md/.json/.yaml/.xml/.html/...) while
|
||||
# ``extract_local_files`` had a broad one. Combined with the unconditional
|
||||
# ``MEDIA:\\s*\\S+`` cleanup at the dispatch sites, that mismatch created a
|
||||
# silent black hole: a ``MEDIA:/report.md`` tag failed the narrow extract_media
|
||||
# match, got stripped from the body by the loose cleanup regex, and was then
|
||||
# invisible to extract_local_files — the file was never delivered (issue
|
||||
# #34517). Keeping one list eliminates the drift; building the cleanup regexes
|
||||
# from the same set means a tag is only stripped when its extension is one we
|
||||
# can actually deliver, so an unknown-extension path survives in the body
|
||||
# instead of vanishing.
|
||||
#
|
||||
# Covers images (inline), video (inline where supported), audio (voice/audio),
|
||||
# documents/spreadsheets/presentations (send_document), archives, and rendered
|
||||
# web output. The dispatch partition (image vs video vs document) lives in
|
||||
# ``gateway/run.py``.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
MEDIA_DELIVERY_EXTS: Tuple[str, ...] = (
|
||||
# Images (embed inline)
|
||||
".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".svg",
|
||||
# Video (embed inline where supported)
|
||||
".mp4", ".mov", ".avi", ".mkv", ".webm",
|
||||
# Audio (delivered as voice/audio where supported)
|
||||
".mp3", ".wav", ".ogg", ".opus", ".m4a", ".flac",
|
||||
# Documents (uploaded as file attachments)
|
||||
".pdf", ".docx", ".doc", ".odt", ".rtf", ".txt", ".md", ".epub",
|
||||
# Spreadsheets / data
|
||||
".xlsx", ".xls", ".ods", ".csv", ".tsv", ".json", ".xml", ".yaml", ".yml",
|
||||
# Presentations
|
||||
".pptx", ".ppt", ".odp", ".key",
|
||||
# Archives
|
||||
".zip", ".tar", ".gz", ".tgz", ".bz2", ".xz", ".7z", ".rar", ".apk", ".ipa",
|
||||
# Web / rendered output
|
||||
".html", ".htm",
|
||||
)
|
||||
|
||||
# Regex alternation fragment of bare extensions (no leading dot), e.g.
|
||||
# ``png|jpe?g|...``. ``jpe?g`` collapses jpg/jpeg into one branch. Sorted
|
||||
# longest-first so the alternation never matches a shorter ext as a prefix of
|
||||
# a longer one (e.g. ``.tar`` before ``.tar.gz`` components).
|
||||
_MEDIA_EXT_ALTERNATION = "|".join(
|
||||
sorted((e.lstrip(".") for e in MEDIA_DELIVERY_EXTS), key=len, reverse=True)
|
||||
)
|
||||
|
||||
# Anchored ``MEDIA:<path>`` cleanup pattern. Unlike the old loose
|
||||
# ``MEDIA:\\s*\\S+``, this only strips a tag whose path ends in a known
|
||||
# deliverable extension (optionally quoted/backticked). A ``MEDIA:`` tag with
|
||||
# an unknown extension is left in the text so it can still be picked up by the
|
||||
# bare-path detector (extract_local_files) downstream rather than silently
|
||||
# deleted. Shared by the non-streaming dispatch path and the streaming
|
||||
# consumer so both behave identically.
|
||||
MEDIA_TAG_CLEANUP_RE = re.compile(
|
||||
r'''[`"']?MEDIA:\s*'''
|
||||
r'''(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|'''
|
||||
r'''(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:''' + _MEDIA_EXT_ALTERNATION + r'''))'''
|
||||
r'''(?=[\s`"',;:)\]}]|$)[`"']?''',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def get_document_cache_dir() -> Path:
|
||||
"""Return the document cache directory, creating it if it doesn't exist."""
|
||||
DOCUMENT_CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
@ -2542,10 +2611,10 @@ class BasePlatformAdapter(ABC):
|
||||
cleaned = cleaned.replace("[[as_document]]", "")
|
||||
|
||||
# Extract MEDIA:<path> tags, allowing optional whitespace after the colon
|
||||
# and quoted/backticked paths for LLM-formatted outputs.
|
||||
media_pattern = re.compile(
|
||||
r'''[`"']?MEDIA:\s*(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:png|jpe?g|gif|webp|mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|txt|csv|apk|ipa)(?=[\s`"',;:)\]}]|$))[`"']?'''
|
||||
)
|
||||
# and quoted/backticked paths for LLM-formatted outputs. The extension
|
||||
# set is the shared MEDIA_DELIVERY_EXTS source of truth (built once into
|
||||
# MEDIA_TAG_CLEANUP_RE) so it can never drift from extract_local_files.
|
||||
media_pattern = MEDIA_TAG_CLEANUP_RE
|
||||
for match in media_pattern.finditer(content):
|
||||
path = match.group("path").strip()
|
||||
if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'":
|
||||
@ -2591,24 +2660,7 @@ class BasePlatformAdapter(ABC):
|
||||
Tuple of (list of expanded file paths, cleaned text with the
|
||||
raw path strings removed).
|
||||
"""
|
||||
_LOCAL_MEDIA_EXTS = (
|
||||
# Images (embed inline)
|
||||
'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff', '.svg',
|
||||
# Video (embed inline where supported)
|
||||
'.mp4', '.mov', '.avi', '.mkv', '.webm',
|
||||
# Audio (delivered as voice/audio where supported)
|
||||
'.mp3', '.wav', '.ogg', '.m4a', '.flac',
|
||||
# Documents (uploaded as file attachments)
|
||||
'.pdf', '.docx', '.doc', '.odt', '.rtf', '.txt', '.md',
|
||||
# Spreadsheets / data
|
||||
'.xlsx', '.xls', '.ods', '.csv', '.tsv', '.json', '.xml', '.yaml', '.yml',
|
||||
# Presentations
|
||||
'.pptx', '.ppt', '.odp', '.key',
|
||||
# Archives
|
||||
'.zip', '.tar', '.gz', '.tgz', '.bz2', '.xz', '.7z', '.rar',
|
||||
# Web / rendered output
|
||||
'.html', '.htm',
|
||||
)
|
||||
_LOCAL_MEDIA_EXTS = MEDIA_DELIVERY_EXTS
|
||||
ext_part = '|'.join(e.lstrip('.') for e in _LOCAL_MEDIA_EXTS)
|
||||
|
||||
# (?<![/:\w.]) prevents matching inside URLs (e.g. https://…/img.png)
|
||||
@ -3729,7 +3781,12 @@ class BasePlatformAdapter(ABC):
|
||||
# Strip any remaining internal directives from message body (fixes #1561)
|
||||
text_content = text_content.replace("[[audio_as_voice]]", "").strip()
|
||||
text_content = text_content.replace("[[as_document]]", "").strip()
|
||||
text_content = re.sub(r"MEDIA:\s*\S+", "", text_content).strip()
|
||||
# Strip only MEDIA: tags whose path has a deliverable extension
|
||||
# (shared MEDIA_TAG_CLEANUP_RE). A MEDIA: tag with an unknown
|
||||
# extension is intentionally left in the body so extract_local_files
|
||||
# below can still pick up the bare path — otherwise the file would
|
||||
# be silently dropped (issue #34517).
|
||||
text_content = MEDIA_TAG_CLEANUP_RE.sub("", text_content).strip()
|
||||
if images:
|
||||
logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response))
|
||||
|
||||
|
||||
@ -11743,9 +11743,16 @@ class GatewayRunner:
|
||||
|
||||
from gateway.platforms.base import BasePlatformAdapter, should_send_media_as_audio
|
||||
|
||||
media_files, _ = adapter.extract_media(response)
|
||||
media_files, cleaned = adapter.extract_media(response)
|
||||
media_files = BasePlatformAdapter.filter_media_delivery_paths(media_files)
|
||||
_, cleaned = adapter.extract_images(response)
|
||||
# Chain the cleaned text through each extractor (extract_media →
|
||||
# extract_images → extract_local_files) so MEDIA: tags and image URLs
|
||||
# are removed before the bare-path auto-detect runs. Previously the
|
||||
# cleaned text from extract_media was dropped (``_``) and
|
||||
# extract_local_files scanned text that still contained MEDIA: tags,
|
||||
# producing false-positive bare-path matches with the MEDIA: prefix
|
||||
# glued on. This matches the chain order in gateway/platforms/base.py.
|
||||
_, cleaned = adapter.extract_images(cleaned)
|
||||
local_files, _ = adapter.extract_local_files(cleaned)
|
||||
local_files = BasePlatformAdapter.filter_local_delivery_paths(local_files)
|
||||
|
||||
|
||||
@ -26,6 +26,7 @@ from typing import Any, Callable, Optional
|
||||
|
||||
from gateway.platforms.base import BasePlatformAdapter as _BasePlatformAdapter
|
||||
from gateway.platforms.base import _custom_unit_to_cp
|
||||
from gateway.platforms.base import MEDIA_TAG_CLEANUP_RE
|
||||
from gateway.config import (
|
||||
DEFAULT_STREAMING_EDIT_INTERVAL as _DEFAULT_STREAMING_EDIT_INTERVAL,
|
||||
DEFAULT_STREAMING_BUFFER_THRESHOLD as _DEFAULT_STREAMING_BUFFER_THRESHOLD,
|
||||
@ -645,10 +646,13 @@ class GatewayStreamConsumer:
|
||||
except Exception as e:
|
||||
logger.error("Stream consumer error: %s", e)
|
||||
|
||||
# Pattern to strip MEDIA:<path> tags (including optional surrounding quotes).
|
||||
# Matches the simple cleanup regex used by the non-streaming path in
|
||||
# gateway/platforms/base.py for post-processing.
|
||||
_MEDIA_RE = re.compile(r'''[`"']?MEDIA:\s*\S+[`"']?''')
|
||||
# Strip MEDIA:<path> tags before display. Uses the shared anchored
|
||||
# MEDIA_TAG_CLEANUP_RE from gateway/platforms/base.py — only tags whose
|
||||
# path ends in a deliverable extension are removed, so an unknown-extension
|
||||
# path stays visible instead of being silently dropped (issue #34517).
|
||||
# Streaming and non-streaming paths share the same regex, so a tag is
|
||||
# treated identically whichever path delivered the text.
|
||||
_MEDIA_RE = MEDIA_TAG_CLEANUP_RE
|
||||
|
||||
@staticmethod
|
||||
def _clean_for_display(text: str) -> str:
|
||||
|
||||
@ -525,6 +525,8 @@ AUTHOR_MAP = {
|
||||
"barnacleboy.jezzahehn@agentmail.to": "JezzaHehn",
|
||||
"254021826+dodo-reach@users.noreply.github.com": "dodo-reach",
|
||||
"259807879+Bartok9@users.noreply.github.com": "Bartok9",
|
||||
"123342691+banditburai@users.noreply.github.com": "banditburai",
|
||||
"9063726+Kyzcreig@users.noreply.github.com": "Kyzcreig",
|
||||
"270082434+crayfish-ai@users.noreply.github.com": "crayfish-ai",
|
||||
"241404605+MestreY0d4-Uninter@users.noreply.github.com": "MestreY0d4-Uninter",
|
||||
"268667990+Roy-oss1@users.noreply.github.com": "Roy-oss1",
|
||||
|
||||
@ -362,6 +362,54 @@ class TestExtractMedia:
|
||||
assert "[[as_document]]" not in cleaned
|
||||
|
||||
|
||||
class TestMediaExtensionAllowlistParity:
|
||||
"""Regression coverage for issue #34517 — the MEDIA: extension black hole.
|
||||
|
||||
extract_media used to carry a narrow extension allowlist that omitted
|
||||
.md/.json/.yaml/.xml/.html etc., while extract_local_files had a broad one.
|
||||
Combined with an unconditional ``MEDIA:\\s*\\S+`` strip at the dispatch
|
||||
sites, an unmatched MEDIA: tag for one of those extensions was deleted from
|
||||
the body before extract_local_files could pick up the bare path — the file
|
||||
was silently dropped. Both extractors now derive from the single
|
||||
MEDIA_DELIVERY_EXTS source of truth, and the strip is anchored to that set.
|
||||
"""
|
||||
|
||||
DROPPED_BEFORE = ["md", "json", "yaml", "yml", "xml", "html", "htm",
|
||||
"tsv", "svg"]
|
||||
|
||||
def test_previously_dropped_extensions_now_extract(self):
|
||||
for ext in self.DROPPED_BEFORE:
|
||||
path = f"/tmp/report.{ext}"
|
||||
media, _ = BasePlatformAdapter.extract_media(f"Here: MEDIA:{path}")
|
||||
assert media == [(path, False)], f".{ext} should extract via MEDIA:"
|
||||
|
||||
def test_extract_media_and_local_files_share_one_extension_set(self):
|
||||
from gateway.platforms.base import MEDIA_DELIVERY_EXTS
|
||||
# Both functions reference MEDIA_DELIVERY_EXTS; assert the documents
|
||||
# that motivated the bug are present in the shared set.
|
||||
for ext in (".md", ".json", ".yaml", ".yml", ".xml", ".html", ".htm"):
|
||||
assert ext in MEDIA_DELIVERY_EXTS
|
||||
|
||||
def test_unknown_extension_not_black_holed_by_cleanup(self):
|
||||
"""A MEDIA: tag with an unknown extension is NOT stripped from the
|
||||
body — it survives so extract_local_files can still see the bare path,
|
||||
rather than vanishing entirely (the core of issue #34517)."""
|
||||
from gateway.platforms.base import MEDIA_TAG_CLEANUP_RE
|
||||
text = "Saved to MEDIA:/tmp/data.weirdext done"
|
||||
media, _ = BasePlatformAdapter.extract_media(text)
|
||||
assert media == [] # unknown extension is not a deliverable MEDIA tag
|
||||
stripped = MEDIA_TAG_CLEANUP_RE.sub("", text)
|
||||
assert "/tmp/data.weirdext" in stripped # path preserved, not dropped
|
||||
|
||||
def test_known_extension_tag_is_stripped_from_body(self):
|
||||
from gateway.platforms.base import MEDIA_TAG_CLEANUP_RE
|
||||
text = "Here is your report: MEDIA:/tmp/report.md"
|
||||
stripped = MEDIA_TAG_CLEANUP_RE.sub("", text).strip()
|
||||
assert "MEDIA:" not in stripped
|
||||
assert "/tmp/report.md" not in stripped
|
||||
assert "Here is your report:" in stripped
|
||||
|
||||
|
||||
class TestMediaDeliveryPathValidation:
|
||||
def _patch_roots(self, monkeypatch, *roots):
|
||||
monkeypatch.setattr(
|
||||
|
||||
Reference in New Issue
Block a user