fix(gateway): restrict auto-appended media to producer tools

This commit is contained in:
VinciZhu
2026-04-27 20:28:02 -04:00
committed by kshitij
parent fb1b681b3b
commit 521d06975e
2 changed files with 145 additions and 30 deletions

View File

@ -669,6 +669,88 @@ def _last_transcript_timestamp(history: Optional[List[Dict[str, Any]]]) -> Any:
return None
# Tool results can contain literal MEDIA: examples in docs, logs, or other
# ordinary outputs. Only tools that intentionally create deliverable media
# artifacts should be eligible for automatic append when the model omits them
# from the final gateway reply.
_AUTO_APPEND_MEDIA_TOOL_NAMES = {"text_to_speech", "text_to_speech_tool"}
# Extension-anchored MEDIA: matcher for tool results. Mirrors the dispatch-site
# pattern so a bare ``MEDIA:`` token in prose (no deliverable extension) is never
# auto-appended. Kept local to the auto-append path; the producer-tool allowlist
# below is the primary guard, this is the secondary precision guard.
_TOOL_MEDIA_RE = re.compile(
r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
r'txt|csv|apk|ipa))',
re.IGNORECASE,
)
def _collect_auto_append_media_tags(
messages: List[Dict[str, Any]],
history_offset: int = 0,
history_media_paths: Optional[set] = None,
) -> tuple[List[str], bool]:
"""Collect real media tags from current-turn producer-tool results only.
Two layered guards keep stale/example MEDIA: strings out of the reply:
1. Producer-tool allowlist: only tools that intentionally emit deliverable
artifacts (TTS) are eligible. Documentation, logs, and search results can
contain example strings such as MEDIA:/absolute/path/to/file, which must
never be delivered as attachments. (Fixes the original report behind #16721.)
2. Current-turn isolation: only messages produced this turn are scanned, so a
tool result from an earlier turn (still present in the full message list)
cannot leak onto a later text-only reply (#34608).
Mid-run context compression can rewrite/shrink the message list below the
original history length. When that happens the slice boundary is no longer
trustworthy, so fall back to scanning every message and rely on
``history_media_paths`` for dedup, preserving the compression-safe behaviour
of #160. The producer-tool allowlist still applies on the fallback path.
"""
history_media_paths = history_media_paths or set()
# Only trust the slice boundary when the message list still contains the
# full history prefix. Otherwise scan everything (compression-safe fallback).
if history_offset and len(messages) >= history_offset:
new_messages = messages[history_offset:]
else:
new_messages = messages
tool_name_by_call_id: Dict[str, str] = {}
for msg in new_messages:
if msg.get("role") != "assistant":
continue
for call in msg.get("tool_calls") or []:
call_id = call.get("id") or call.get("call_id")
fn = call.get("function") or {}
name = str(fn.get("name") or call.get("name") or "")
if call_id and name:
tool_name_by_call_id[str(call_id)] = name
media_tags: List[str] = []
has_voice_directive = False
for msg in new_messages:
if msg.get("role") not in ("tool", "function"):
continue
call_id = str(msg.get("tool_call_id") or msg.get("call_id") or "")
if tool_name_by_call_id.get(call_id) not in _AUTO_APPEND_MEDIA_TOOL_NAMES:
continue
content = str(msg.get("content") or "")
if "MEDIA:" not in content:
continue
for match in _TOOL_MEDIA_RE.finditer(content):
path = match.group(1).strip().rstrip('\",}')
if path and path not in history_media_paths:
media_tags.append(f"MEDIA:{path}")
if "[[audio_as_voice]]" in content:
has_voice_directive = True
return media_tags, has_voice_directive
# ---------------------------------------------------------------------------
# SSL certificate auto-detection for NixOS and other non-standard systems.
# Must run BEFORE any HTTP library (discord, aiohttp, etc.) is imported.
@ -17931,36 +18013,12 @@ class GatewayRunner:
# context compression shrinks the message list below the original
# history length, preserving the compression-safe behaviour of #160.
if "MEDIA:" not in final_response:
media_tags = []
has_voice_directive = False
_all_msgs = result.get("messages", [])
_history_len = len(agent_history)
# Only trust the slice boundary when the message list still
# contains the full history prefix. Mid-run compression can
# rewrite/shrink the list; in that case fall back to scanning
# everything and rely on _history_media_paths for dedup.
if _history_len and len(_all_msgs) >= _history_len:
_scan_msgs = _all_msgs[_history_len:]
else:
_scan_msgs = _all_msgs
for msg in _scan_msgs:
if msg.get("role") in {"tool", "function"}:
content = msg.get("content", "")
if "MEDIA:" in content:
_TOOL_MEDIA_RE = re.compile(
r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
r'txt|csv|apk|ipa))',
re.IGNORECASE
)
for match in _TOOL_MEDIA_RE.finditer(content):
path = match.group(1).strip().rstrip('",}')
if path and path not in _history_media_paths:
media_tags.append(f"MEDIA:{path}")
if "[[audio_as_voice]]" in content:
has_voice_directive = True
media_tags, has_voice_directive = _collect_auto_append_media_tags(
result.get("messages", []),
history_offset=len(agent_history),
history_media_paths=_history_media_paths,
)
if media_tags:
seen = set()
unique_tags = []

View File

@ -102,6 +102,63 @@ def extract_media_tags_broken(result_messages):
class TestMediaExtraction:
"""Tests for MEDIA tag extraction from tool results."""
def test_gateway_auto_append_ignores_media_examples_in_skill_docs(self):
"""Skill/documentation examples must not be appended as real attachments."""
from gateway.run import _collect_auto_append_media_tags
messages = [
{"role": "user", "content": "How should I format gateway media?"},
{
"role": "assistant",
"tool_calls": [
{"id": "call_skill", "function": {"name": "skill_view"}}
],
},
{
"role": "tool",
"tool_call_id": "call_skill",
"content": """
Recommended pattern:
```text
MEDIA:/absolute/path/to/image.png
```
Second message:
```text
caption
```
""",
},
{"role": "assistant", "content": "Use a standalone media message."},
]
tags, voice = _collect_auto_append_media_tags(messages, history_offset=0)
assert tags == []
assert voice is False
def test_gateway_auto_append_keeps_real_tts_media_tag(self):
"""TTS tool media tags are still auto-appended when the model omits them."""
from gateway.run import _collect_auto_append_media_tags
messages = [
{"role": "user", "content": "Say this as audio"},
{
"role": "assistant",
"tool_calls": [
{"id": "call_tts", "function": {"name": "text_to_speech"}}
],
},
{
"role": "tool",
"tool_call_id": "call_tts",
"content": '{"success": true, "media_tag": "[[audio_as_voice]]\\nMEDIA:/tmp/voice.ogg"}',
},
{"role": "assistant", "content": "Done."},
]
tags, voice = _collect_auto_append_media_tags(messages, history_offset=0)
assert tags == ["MEDIA:/tmp/voice.ogg"]
assert voice is True
def test_media_tags_not_extracted_from_history(self):
"""MEDIA tags from previous turns should NOT be extracted again."""