Files
hermes-agent/tests/hermes_cli/test_kanban_worker_image_extraction.py
Teknium 769ee86cd2 feat(kanban): attach images referenced in task bodies to worker vision (#34210)
Kanban workers now scan the task body for local image paths and
http(s) image URLs and attach them to the worker's first user turn —
matching the CLI/gateway behaviour for inbound images. Before, a
user pasting `/home/me/screenshot.png` or `https://example.com/img.png`
into a kanban task description had it sent to the model as plain
text and the pixels were never seen.

How it works:
* agent/image_routing.py gains extract_image_refs(text) → (paths, urls)
  that mirrors gateway/platforms/base.py:extract_local_files (absolute /
  ~-relative paths, image extensions only, ignores fenced/inline code).
* build_native_content_parts() accepts an optional image_urls= kwarg
  and emits passthrough image_url parts for remote URLs alongside the
  base64 data: URLs used for local paths.
* cli.py (single-query/quiet branch — the path every dispatcher-spawned
  worker takes) detects HERMES_KANBAN_TASK, reads the task body via
  kanban_db.get_task, runs extract_image_refs, and threads the results
  into the existing image-routing decision (native vs text). Best-effort:
  enrichment failures never block worker startup.

Tested:
* tests/agent/test_image_routing.py — 22 new tests for extract_image_refs
  and URL pass-through in build_native_content_parts.
* tests/hermes_cli/test_kanban_worker_image_extraction.py — 10 new tests
  driving real kanban_db round-trip (create task → read body → extract
  refs → build parts).
* E2E: created a fake kanban task with a body referencing both a local
  PNG and an https URL; verified the worker pipeline produces a
  multimodal user turn with 1 text part + 2 image_url parts (data URL
  for the local file, passthrough URL for the remote).
2026-05-28 17:50:42 -07:00

239 lines
8.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Worker-side image enrichment for kanban tasks.
When a kanban task body contains a local image path or an ``http(s)://``
image URL, the worker must surface that image to the model on its first
user turn — matching the CLI/gateway behaviour for inbound images.
The dispatcher spawns the worker as
``hermes -p <profile> chat -q "work kanban task <id>"``. The task body
itself never appears in argv; the worker has to read it from the kanban
DB during startup. These tests cover the round-trip:
task body → kanban_db.get_task → extract_image_refs →
build_native_content_parts → multimodal user turn
"""
from __future__ import annotations
import base64
from pathlib import Path
import pytest
from hermes_cli import kanban_db as kb
from agent.image_routing import (
build_native_content_parts,
extract_image_refs,
)
# Tiny 1×1 transparent PNG used to back any path the tests stick into a
# task body. extract_image_refs validates the path exists on disk, so the
# byte content has to be a real readable file (any image bytes will do).
_PNG = base64.b64decode(
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGNgYGBgAAAABQABpfZFQAAAAABJRU5ErkJggg=="
)
@pytest.fixture
def kanban_home(tmp_path: Path, monkeypatch):
"""Isolated HERMES_HOME with a fresh kanban DB for each test."""
home = tmp_path / ".hermes"
home.mkdir()
monkeypatch.setenv("HERMES_HOME", str(home))
monkeypatch.setattr(Path, "home", lambda: tmp_path)
kb.init_db()
return home
def _add_task_with_body(body: str, *, title: str = "Look at this") -> str:
conn = kb.connect()
try:
task_id = kb.create_task(
conn,
title=title,
body=body,
assignee="worker-a",
tenant=None,
)
finally:
conn.close()
return task_id
def _read_body(task_id: str) -> str:
conn = kb.connect()
try:
task = kb.get_task(conn, task_id)
return (task.body if task is not None else "") or ""
finally:
conn.close()
class TestExtractFromTaskBody:
"""Read a real kanban task body and run it through extract_image_refs."""
def test_local_path_in_body_round_trips(self, kanban_home, tmp_path):
img = tmp_path / "screenshot.png"
img.write_bytes(_PNG)
tid = _add_task_with_body(
f"Please review the screenshot at {img} and confirm "
"the alignment is right."
)
body = _read_body(tid)
paths, urls = extract_image_refs(body)
assert paths == [str(img)]
assert urls == []
def test_url_in_body_round_trips(self, kanban_home):
tid = _add_task_with_body(
"The design lives at https://example.com/mock/v3.png — "
"make the implementation match it."
)
body = _read_body(tid)
paths, urls = extract_image_refs(body)
assert paths == []
assert urls == ["https://example.com/mock/v3.png"]
def test_mixed_path_and_url_in_body(self, kanban_home, tmp_path):
img = tmp_path / "current.png"
img.write_bytes(_PNG)
tid = _add_task_with_body(
f"Compare the current screenshot {img} against the design at "
"https://example.com/target.png and write a diff."
)
body = _read_body(tid)
paths, urls = extract_image_refs(body)
assert paths == [str(img)]
assert urls == ["https://example.com/target.png"]
def test_body_without_images_yields_nothing(self, kanban_home):
tid = _add_task_with_body(
"Refactor the auth module to use the new session helper."
)
body = _read_body(tid)
paths, urls = extract_image_refs(body)
assert paths == []
assert urls == []
def test_empty_body_is_safe(self, kanban_home):
tid = _add_task_with_body("")
body = _read_body(tid)
paths, urls = extract_image_refs(body)
assert paths == []
assert urls == []
class TestBuildPartsFromTaskBody:
"""Verify the full pipeline produces a multimodal user turn."""
def test_local_path_becomes_native_image_part(self, kanban_home, tmp_path):
img = tmp_path / "design.png"
img.write_bytes(_PNG)
tid = _add_task_with_body(f"Check out {img} — what's broken?")
body = _read_body(tid)
paths, urls = extract_image_refs(body)
# Mirrors the cli.py wiring: pass the worker's literal -q argument
# (the dispatcher uses ``"work kanban task <id>"``) plus the
# extracted refs through build_native_content_parts.
parts, skipped = build_native_content_parts(
f"work kanban task {tid}",
paths,
image_urls=urls or None,
)
assert skipped == []
# text part + one image_url part
assert len(parts) == 2
assert parts[0]["type"] == "text"
assert parts[0]["text"].startswith(f"work kanban task {tid}")
assert f"[Image attached at: {img}]" in parts[0]["text"]
assert parts[1]["type"] == "image_url"
assert parts[1]["image_url"]["url"].startswith("data:image/png;base64,")
def test_url_becomes_image_url_part(self, kanban_home):
tid = _add_task_with_body(
"Reference: https://example.com/target.jpg — match it."
)
body = _read_body(tid)
paths, urls = extract_image_refs(body)
parts, skipped = build_native_content_parts(
f"work kanban task {tid}",
paths,
image_urls=urls or None,
)
assert skipped == []
assert len(parts) == 2
assert parts[0]["type"] == "text"
assert "[Image attached: https://example.com/target.jpg]" in parts[0]["text"]
assert parts[1] == {
"type": "image_url",
"image_url": {"url": "https://example.com/target.jpg"},
}
def test_body_with_both_yields_two_image_parts(self, kanban_home, tmp_path):
img = tmp_path / "local.png"
img.write_bytes(_PNG)
tid = _add_task_with_body(
f"Diff {img} vs https://example.com/target.png — explain it."
)
body = _read_body(tid)
paths, urls = extract_image_refs(body)
parts, skipped = build_native_content_parts(
f"work kanban task {tid}",
paths,
image_urls=urls or None,
)
assert skipped == []
image_parts = [p for p in parts if p.get("type") == "image_url"]
assert len(image_parts) == 2
# Local file is embedded as a data URL; remote URL passes through.
assert image_parts[0]["image_url"]["url"].startswith("data:image/png;base64,")
assert image_parts[1]["image_url"]["url"] == "https://example.com/target.png"
def test_body_with_no_images_leaves_query_untouched(self, kanban_home):
tid = _add_task_with_body(
"Rewrite the README intro paragraph to focus on use cases."
)
body = _read_body(tid)
paths, urls = extract_image_refs(body)
parts, skipped = build_native_content_parts(
f"work kanban task {tid}",
paths,
image_urls=urls or None,
)
# No images → plain text-only return (single part, no list mutation).
assert skipped == []
assert len(parts) == 1
assert parts[0]["type"] == "text"
assert parts[0]["text"] == f"work kanban task {tid}"
def test_code_block_example_is_not_attached(self, kanban_home, tmp_path):
# Only the real image outside the fenced code block should attach.
real = tmp_path / "real.png"
real.write_bytes(_PNG)
tid = _add_task_with_body(
f"Real screenshot:\n{real}\n\n"
"Example we DON'T want attached:\n"
"```\n"
"image: /tmp/example_only.png\n"
"url: https://example.com/example.png\n"
"```\n"
)
body = _read_body(tid)
paths, urls = extract_image_refs(body)
assert paths == [str(real)]
assert urls == []