From e28a668b40d3888fb69a624ef5fcb4dd59c9e5ff Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 01:10:48 -0700 Subject: [PATCH] fix(gateway): diagnosable MEDIA rejections + canonical cache roots + null-path guard Operators can now see which MEDIA path was dropped and why, generated artifacts under the canonical ~/.hermes/cache/{images,...} layout deliver, and a crafted ~\x00 path no longer aborts the whole attachment batch. - MEDIA_DELIVERY_SAFE_ROOTS: add canonical cache/{images,audio,videos, documents,screenshots} alongside the legacy *_cache dirs (#31733). - filter_media/local_delivery_paths: log the rejected path (was a blind "outside allowed roots") via _log_safe_path, which strips control chars and Unicode line separators so a model-emitted path can't forge a log line. - validate_media_delivery_path + extract_media: guard os.path.expanduser so a ~\x00 path returns None / is skipped instead of raising and dropping every other attachment in the response. Salvaged and slimmed from #33251 (780 LOC -> 35): the reason-tag taxonomy, the parts-eliding redactor, and the extension-partition hoist are dropped in favor of logging the path directly. All three findings were verified and reproduced by the contributor. Co-authored-by: wysie --- gateway/platforms/base.py | 41 +++++++++++++++++++---- tests/gateway/test_platform_base.py | 50 +++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 6 deletions(-) diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 766f3541a..31af977c4 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -848,6 +848,13 @@ MEDIA_DELIVERY_SAFE_ROOTS = ( _HERMES_HOME / "video_cache", _HERMES_HOME / "document_cache", _HERMES_HOME / "browser_screenshots", + # Canonical cache layout — listed alongside the legacy *_cache dirs so + # generated artifacts deliver on installs that have both (#31733). + _HERMES_HOME / "cache" / "images", + _HERMES_HOME / "cache" / "audio", + _HERMES_HOME / "cache" / "videos", + _HERMES_HOME / "cache" / "documents", + _HERMES_HOME / "cache" / "screenshots", ) # Default recency window for trusting freshly-produced files (seconds). @@ -1022,7 +1029,11 @@ def validate_media_delivery_path(path: str) -> Optional[str]: if not candidate: return None - expanded = Path(os.path.expanduser(candidate)) + try: + expanded = Path(os.path.expanduser(candidate)) + except (OSError, RuntimeError, ValueError): + # expanduser raises ValueError("embedded null byte") for a ~\x00 path. + return None if not expanded.is_absolute(): return None @@ -1066,6 +1077,17 @@ def validate_media_delivery_path(path: str) -> Optional[str]: return None +# Neutralise control chars and the Unicode line separators (NEL, LS, PS) that +# str.splitlines() / log aggregators treat as breaks, so a model-emitted path +# can't forge a second log line. Truncated to keep records bounded. +_LOG_UNSAFE_CHARS = re.compile(r"[\x00-\x1f\x7f\x85\u2028\u2029]") + + +def _log_safe_path(path: str) -> str: + """Return a single-line, length-bounded path for log output.""" + return _LOG_UNSAFE_CHARS.sub("?", str(path))[:200] + + SUPPORTED_DOCUMENT_TYPES = { ".pdf": "application/pdf", ".md": "text/markdown", @@ -2438,11 +2460,12 @@ class BasePlatformAdapter(ABC): """Drop unsafe MEDIA paths and normalize accepted paths.""" safe_media: List[Tuple[str, bool]] = [] for media_path, is_voice in media_files or []: - safe_path = validate_media_delivery_path(str(media_path)) + raw = str(media_path) + safe_path = validate_media_delivery_path(raw) if safe_path: safe_media.append((safe_path, bool(is_voice))) else: - logger.warning("Skipping unsafe MEDIA directive path outside allowed roots") + logger.warning("Skipping unsafe MEDIA directive path: %s", _log_safe_path(raw)) return safe_media @staticmethod @@ -2450,11 +2473,12 @@ class BasePlatformAdapter(ABC): """Drop unsafe bare local file paths and normalize accepted paths.""" safe_paths: List[str] = [] for file_path in file_paths or []: - safe_path = validate_media_delivery_path(str(file_path)) + raw = str(file_path) + safe_path = validate_media_delivery_path(raw) if safe_path: safe_paths.append(safe_path) else: - logger.warning("Skipping unsafe local file path outside allowed roots") + logger.warning("Skipping unsafe local file path: %s", _log_safe_path(raw)) return safe_paths @staticmethod @@ -2505,7 +2529,12 @@ class BasePlatformAdapter(ABC): path = path[1:-1].strip() path = path.lstrip("`\"'").rstrip("`\"',.;:)}]") if path: - media.append((os.path.expanduser(path), has_voice_tag)) + try: + media.append((os.path.expanduser(path), has_voice_tag)) + except (OSError, RuntimeError, ValueError): + # Skip a crafted ~\x00 path rather than aborting extraction + # and dropping every other attachment in the response. + continue # Remove MEDIA tags from content (including surrounding quote/backtick wrappers) if media: diff --git a/tests/gateway/test_platform_base.py b/tests/gateway/test_platform_base.py index 34d94c06f..6a5b8c15c 100644 --- a/tests/gateway/test_platform_base.py +++ b/tests/gateway/test_platform_base.py @@ -12,6 +12,7 @@ from gateway.platforms.base import ( MessageEvent, safe_url_for_log, utf16_len, + _log_safe_path, _prefix_within_utf16_limit, ) @@ -1050,3 +1051,52 @@ class TestProxyKwargsForAiohttp: sess_kw, req_kw = proxy_kwargs_for_aiohttp("http://proxy:8080") assert sess_kw == {} assert req_kw == {"proxy": "http://proxy:8080"} + + +class TestMediaDeliveryDiagnosability: + """Diagnosable rejection logging + crafted-path robustness (#33251).""" + + def test_rejected_path_appears_in_log(self, tmp_path, caplog): + outside = tmp_path / "outside.ogg" + outside.write_bytes(b"OggS") + with patch.dict(os.environ, {"HERMES_MEDIA_DELIVERY_STRICT": "1", + "HERMES_MEDIA_TRUST_RECENT_FILES": "0"}), \ + patch("gateway.platforms.base.MEDIA_DELIVERY_SAFE_ROOTS", ()): + with caplog.at_level("WARNING"): + out = BasePlatformAdapter.filter_media_delivery_paths([(str(outside), False)]) + assert out == [] + # The dropped path must be in the log so operators can diagnose it. + assert str(outside) in caplog.text + + def test_crafted_null_path_does_not_abort_batch(self, tmp_path, monkeypatch): + """One crafted ~\\x00 path must not drop every other attachment.""" + good = tmp_path / "good.png" + good.write_bytes(b"\x89PNG") + monkeypatch.setenv("HERMES_MEDIA_DELIVERY_STRICT", "0") + out = BasePlatformAdapter.filter_media_delivery_paths([ + ("~\x00evil.png", False), + (str(good), False), + ]) + assert out == [(str(good.resolve()), False)] + + def test_extract_media_tolerates_crafted_null_path(self): + """extract_media must not raise on a crafted ~\\x00 MEDIA tag.""" + content = "here\nMEDIA:`~\x00evil.png`\ntrailing" + # Must not raise ValueError("embedded null byte"). + media, cleaned = BasePlatformAdapter.extract_media(content) + assert all("\x00" not in p for p, _ in media) + + def test_log_safe_path_neutralises_line_breaks(self): + forged = "/tmp/a.png\nWARNING forged second line" + assert "\n" not in _log_safe_path(forged) + # Unicode separators that split log lines are also neutralised. + for sep in ("\u2028", "\u2029", "\x85"): + assert sep not in _log_safe_path(f"/tmp/a{sep}b.png") + + def test_canonical_cache_roots_present(self): + from gateway.platforms.base import MEDIA_DELIVERY_SAFE_ROOTS + roots = {str(r) for r in MEDIA_DELIVERY_SAFE_ROOTS} + assert any(r.endswith("cache/images") for r in roots) + assert any(r.endswith("cache/documents") for r in roots) + # Legacy layout still present. + assert any(r.endswith("image_cache") for r in roots)