test(tool-search): add live A/B harness, drop checked-in transcripts

Brings in the tool_search live-test harness from the original PR but leaves out the 11 checked-in scripts/out/*.json transcript files — those are non-deterministic model output that goes stale the moment the model changes and were the bulk of the diff. scripts/out/ is now gitignored so a harness run never re-commits them. Fixes on top: - API-key loading goes through hermes_cli.env_loader.load_hermes_dotenv instead of hand-parsing ~/.hermes/.env and assigning the value to a local. The canonical loader never materializes the secret in a local variable in this module, which clears the four CodeQL high alerts (py/clear-text-storage / py/clear-text-logging-sensitive-data at the transcript write/print sites — they were tracing the key from the hand-rolled parser into the records) and removes a hand-rolled parser. - encoding='utf-8' on every write_text/read_text in both harness scripts (Windows-footgun hygiene). Co-authored-by: teknium1 <127238744+teknium1@users.noreply.github.com>
2026-05-29 01:28:22 -07:00
parent 7427b9d581
commit 1709776120
4 changed files with 690 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -92,3 +92,7 @@ docs/superpowers/*
 # also created in-repo when an agent operates in this checkout). Plans, audit
 # logs, and per-session caches are never artifacts of the codebase.
 .hermes/
+
+# Tool Search live-test harness output — non-deterministic model transcripts,
+# regenerated by scripts/tool_search_livetest.py. Never an artifact of the repo.
+scripts/out/
--- a/scripts/LIVETEST_README.md
+++ b/scripts/LIVETEST_README.md
@ -0,0 +1,45 @@
+# Tool Search live test harness
+
+Runs five scenarios against a real model (Claude Haiku 4.5 via OpenRouter) to
+verify that the bridge tools work end-to-end. Records transcripts in
+`scripts/out/`.
+
+## Running
+
+```bash
+cd <repo root>
+python3 scripts/tool_search_livetest.py        # runs all 5 scenarios x 2 modes
+python3 scripts/analyze_livetest.py            # side-by-side report
+```
+
+Requires `OPENROUTER_API_KEY` set or present in `~/.hermes/.env`.
+
+## What it verifies
+
+| Scenario | Tests |
+|----------|-------|
+| A obvious_single | BM25 retrieval on an obvious tool name (github_create_issue) |
+| B vague_paraphrased | Retrieval when the model has to paraphrase ("schedule meeting" → evt_create) |
+| C multi_tool_chain | Multi-step task chaining two deferred tools (GitHub + Slack) |
+| D core_plus_deferred | Mixed: core tool (read_file) called directly, deferred tool (Slack) via bridge |
+| E no_tool_needed | Pure-knowledge prompt; verify no spurious tool_search invocations |
+
+Each scenario runs with `tool_search.enabled = on` and again with `off` for an
+A/B baseline. The harness records:
+
+- bridge_calls (the tool_search / tool_describe / tool_call sequence the model emitted)
+- underlying_tool_calls (what actually ran through the registry dispatcher)
+- final_response, iteration count, elapsed time, any errors
+
+## Output structure
+
+```
+scripts/out/
+  <scenario>__enabled.json    # tool_search ON
+  <scenario>__disabled.json   # tool_search OFF
+  _summary.json               # one-line summary across all runs
+```
+
+The 2026-05 baseline run is checked in for reference. Re-running may produce
+slightly different transcripts (the model is non-deterministic) but the
+expected_underlying_tools assertions should remain satisfied.
--- a/scripts/analyze_livetest.py
+++ b/scripts/analyze_livetest.py
@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+"""Compare enabled vs disabled runs and produce a readable report.
+
+Reads scripts/out/_summary.json and the per-scenario JSONs, prints a side-by-
+side comparison of what happened, and flags anomalies.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+
+HERE = Path(__file__).resolve().parent
+OUT = HERE / "out"
+
+
+def load_record(scenario_id: str, mode: str):
+    path = OUT / f"{scenario_id}__{mode}.json"
+    if not path.exists():
+        return None
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def fmt_tool_seq(calls):
+    if not calls:
+        return "(none)"
+    return " → ".join(c["name"] for c in calls)
+
+
+def fmt_bridge_seq(calls):
+    if not calls:
+        return "(none)"
+    parts = []
+    for c in calls:
+        if c["name"] == "tool_call":
+            inner = (c.get("args") or {}).get("name", "?")
+            parts.append(f"tool_call→{inner}")
+        elif c["name"] == "tool_search":
+            q = (c.get("args") or {}).get("query", "?")
+            parts.append(f"search('{q[:30]}')")
+        elif c["name"] == "tool_describe":
+            n = (c.get("args") or {}).get("name", "?")
+            parts.append(f"describe({n})")
+    return " → ".join(parts)
+
+
+def main():
+    if not OUT.exists():
+        print("No output directory at", OUT)
+        sys.exit(1)
+    summary_path = OUT / "_summary.json"
+    if not summary_path.exists():
+        print("No _summary.json yet")
+        sys.exit(1)
+
+    summary = json.loads(summary_path.read_text(encoding="utf-8"))
+    scenarios = sorted({row["scenario"] for row in summary})
+
+    print(f"{'='*78}")
+    print(f"  Live test results: tool_search ENABLED vs DISABLED")
+    print(f"{'='*78}\n")
+
+    fails = 0
+    for sid in scenarios:
+        en = load_record(sid, "enabled")
+        di = load_record(sid, "disabled")
+        if not en or not di:
+            continue
+        expected = set(en["expected_underlying_tools"])
+
+        print(f"┌─ {sid}  ({en['scenario_description']})")
+        print(f"│  Prompt: {en['prompt'][:120]}")
+        print(f"│  Expected underlying tools: {sorted(expected) or '(none)'}")
+        print(f"│")
+
+        for label, rec in [("ENABLED ", en), ("DISABLED", di)]:
+            called_under = [c["name"] for c in rec["underlying_tool_calls"]]
+            called_set = set(called_under)
+            missing = expected - called_set
+            extra = called_set - expected - {"read_file", "search_files", "terminal", "todo", "memory"}
+
+            mark = "✓" if (expected.issubset(called_set) and not rec["error"]) else "✗"
+            if mark == "✗":
+                fails += 1
+
+            print(f"│  {label} {mark}  bridges={len(rec['bridge_calls']):2}  underlying={len(rec['underlying_tool_calls']):2}  "
+                  f"iters={rec['n_iterations']:2}  elapsed={rec['elapsed_seconds']:5.1f}s  err={bool(rec['error'])}")
+            print(f"│    underlying: {fmt_tool_seq(rec['underlying_tool_calls'])}")
+            if rec["bridge_calls"]:
+                print(f"│    bridges:    {fmt_bridge_seq(rec['bridge_calls'])}")
+            if missing:
+                print(f"│    ⚠ MISSING expected tools: {sorted(missing)}")
+            if extra:
+                print(f"│    ⓘ extra tools called: {sorted(extra)}")
+            if rec["error"]:
+                print(f"│    💥 error: {rec['error'][:200]}")
+        # Bridge-trip count vs direct (interesting comparator)
+        en_bridges = len(en["bridge_calls"])
+        di_underlying = len(di["underlying_tool_calls"])
+        en_underlying = len(en["underlying_tool_calls"])
+        overhead = en_bridges + en_underlying - di_underlying
+        print(f"│  Δ round-trip cost: enabled used {en_bridges + en_underlying} calls vs disabled {di_underlying}  →  +{overhead}")
+        print(f"│  Final (enabled):  {(en.get('final_response') or '')[:140]}")
+        print(f"│  Final (disabled): {(di.get('final_response') or '')[:140]}")
+        print(f"└──")
+        print()
+
+    print(f"\nFails: {fails}/{2*len(scenarios)}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/tool_search_livetest.py
+++ b/scripts/tool_search_livetest.py
@ -0,0 +1,527 @@
+#!/usr/bin/env python3
+"""Live test harness for Hermes Agent's Tool Search feature.
+
+Spins up a real AIAgent against a real model, registers ~20 fake "MCP" tools
+with realistic shapes (github-like, slack-like, calendar-like, search-like),
+runs a small set of scenarios, and records exactly what the model did.
+
+For each scenario we record:
+  - the full message transcript
+  - the sequence of tool calls (name + args) the model emitted
+  - which underlying tools actually got invoked (after bridge unwrap)
+  - the final assistant response
+  - timing and round-trip count
+
+Each scenario runs twice:
+  - tool_search ENABLED  (deferred behind bridges)
+  - tool_search DISABLED (all tools loaded directly)
+
+Output: ./out/<scenario_id>__<enabled|disabled>.json
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import shutil
+import sys
+import tempfile
+import time
+import traceback
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+# Force-isolate the test environment BEFORE any hermes imports.
+ORIGINAL_HOME = os.environ.get("HERMES_HOME")
+ORIGINAL_AUTH = Path.home() / ".hermes" / "auth.json"
+
+_THIS_DIR = Path(__file__).resolve().parent
+_WORKTREE_ROOT = _THIS_DIR.parent
+sys.path.insert(0, str(_WORKTREE_ROOT))
+
+# ---------------------------------------------------------------------------
+# Fake MCP tools — realistic shape, varied difficulty for retrieval
+# ---------------------------------------------------------------------------
+
+FAKE_MCP_TOOLS: List[Dict[str, Any]] = [
+    # GitHub cluster
+    {
+        "name": "github_create_issue",
+        "description": "Open a new issue in a GitHub repository. Use when the user wants to report a bug or request a feature in a repo.",
+        "params": {"repo": ("string", "Repository in owner/name form"),
+                   "title": ("string", "Issue title"),
+                   "body": ("string", "Issue body in Markdown")},
+        "returns": lambda args: {"ok": True, "issue_url": f"https://github.com/{args.get('repo','x/y')}/issues/42"},
+    },
+    {
+        "name": "github_search_repos",
+        "description": "Search GitHub repositories by free-text query. Returns a ranked list of repo names with star counts.",
+        "params": {"query": ("string", "Search terms"),
+                   "limit": ("integer", "Max results")},
+        "returns": lambda args: {"results": [{"name": "fake/repo-1", "stars": 1200},
+                                             {"name": "fake/repo-2", "stars": 540}]},
+    },
+    {
+        "name": "github_close_pr",
+        "description": "Close a pull request without merging it. Use when the PR should be abandoned.",
+        "params": {"repo": ("string", ""), "pr_number": ("integer", "")},
+        "returns": lambda args: {"ok": True, "state": "closed"},
+    },
+    {
+        "name": "github_list_pulls",
+        "description": "List open pull requests for a repository.",
+        "params": {"repo": ("string", "")},
+        "returns": lambda args: {"pulls": [{"number": 31163, "title": "feat(tools): tool search"}]},
+    },
+
+    # Slack cluster
+    {
+        "name": "slack_send_message",
+        "description": "Post a message into a Slack channel as the connected workspace's app.",
+        "params": {"channel": ("string", "Channel name with leading #"),
+                   "text": ("string", "Message body")},
+        "returns": lambda args: {"ok": True, "ts": "1716528000.000100"},
+    },
+    {
+        "name": "slack_list_channels",
+        "description": "Return all channels visible to the connected Slack workspace bot.",
+        "params": {},
+        "returns": lambda args: {"channels": ["#general", "#engineering", "#random"]},
+    },
+    {
+        "name": "slack_set_status",
+        "description": "Set the current user's Slack status (emoji + text).",
+        "params": {"emoji": ("string", ""), "text": ("string", "")},
+        "returns": lambda args: {"ok": True},
+    },
+
+    # Calendar cluster (intentionally vague names to stress retrieval)
+    {
+        "name": "evt_create",
+        "description": "Add an event to the connected calendar. Used for scheduling meetings.",
+        "params": {"title": ("string", ""),
+                   "start": ("string", "ISO 8601 datetime"),
+                   "duration_min": ("integer", "")},
+        "returns": lambda args: {"ok": True, "event_id": "evt_abc"},
+    },
+    {
+        "name": "evt_list",
+        "description": "List upcoming calendar events.",
+        "params": {"max_results": ("integer", "")},
+        "returns": lambda args: {"events": [{"id": "evt_1", "title": "Standup", "start": "2026-05-25T09:00:00Z"}]},
+    },
+
+    # Knowledge / docs (paraphrased name to stress retrieval)
+    {
+        "name": "docsearch_query",
+        "description": "Search the user's internal documentation index for matching pages.",
+        "params": {"q": ("string", "Search query"), "limit": ("integer", "")},
+        "returns": lambda args: {"hits": [{"title": "Onboarding", "url": "https://docs/x"}]},
+    },
+    {
+        "name": "docsearch_fetch",
+        "description": "Fetch the full markdown content of one document by ID.",
+        "params": {"id": ("string", "")},
+        "returns": lambda args: {"content": "# Onboarding\n..."},
+    },
+
+    # Database
+    {
+        "name": "db_query",
+        "description": "Run a read-only SQL query against the analytics database.",
+        "params": {"sql": ("string", "SELECT ... statement")},
+        "returns": lambda args: {"rows": [{"id": 1, "name": "alice"}]},
+    },
+    {
+        "name": "db_describe_table",
+        "description": "Show the schema of a database table.",
+        "params": {"table": ("string", "")},
+        "returns": lambda args: {"columns": [{"name": "id", "type": "int"}, {"name": "name", "type": "text"}]},
+    },
+
+    # Linear
+    {
+        "name": "linear_create_ticket",
+        "description": "Create a new Linear issue (ticket) in the connected workspace.",
+        "params": {"title": ("string", ""), "body": ("string", ""), "priority": ("integer", "1-4")},
+        "returns": lambda args: {"ok": True, "id": "ENG-101"},
+    },
+    {
+        "name": "linear_assign",
+        "description": "Reassign a Linear ticket to a different user.",
+        "params": {"ticket_id": ("string", ""), "user": ("string", "")},
+        "returns": lambda args: {"ok": True},
+    },
+
+    # Notion
+    {
+        "name": "notion_create_page",
+        "description": "Create a new page in the connected Notion workspace.",
+        "params": {"title": ("string", ""), "body": ("string", ""), "parent": ("string", "")},
+        "returns": lambda args: {"ok": True, "page_id": "abc123"},
+    },
+
+    # Random others (filler / distractors)
+    {
+        "name": "weather_get",
+        "description": "Look up the current weather for a city.",
+        "params": {"city": ("string", "")},
+        "returns": lambda args: {"city": args.get("city", ""), "temp_c": 19, "summary": "Cloudy"},
+    },
+    {
+        "name": "translate_text",
+        "description": "Translate a short text from one language to another.",
+        "params": {"text": ("string", ""), "to": ("string", "Target language code")},
+        "returns": lambda args: {"translated": args.get("text", "") + " [translated to " + args.get("to", "??") + "]"},
+    },
+    {
+        "name": "pdf_extract",
+        "description": "Extract text from a PDF file given its path.",
+        "params": {"path": ("string", "")},
+        "returns": lambda args: {"text": "[fake PDF text]"},
+    },
+    {
+        "name": "yt_transcript",
+        "description": "Fetch the transcript for a YouTube video by URL.",
+        "params": {"url": ("string", "")},
+        "returns": lambda args: {"transcript": "[fake transcript]"},
+    },
+]
+
+
+# ---------------------------------------------------------------------------
+# Scenario definitions
+# ---------------------------------------------------------------------------
+
+SCENARIOS: List[Dict[str, Any]] = [
+    {
+        "id": "A_obvious_single",
+        "description": "Single tool, obvious name in the user request",
+        "prompt": (
+            "Open a GitHub issue in repo 'acme/widget' titled 'Crash on startup' "
+            "with body 'App crashes immediately after launch when offline.' "
+            "Then tell me you're done. Don't do anything else."
+        ),
+        "expected_underlying_tools": ["github_create_issue"],
+    },
+    {
+        "id": "B_vague_paraphrased",
+        "description": "Single tool, paraphrased intent (tests retrieval quality)",
+        "prompt": (
+            "Add a meeting to my schedule for tomorrow morning at 10am called "
+            "'Design review', 30 minutes long. Then tell me you're done. Don't do anything else."
+        ),
+        "expected_underlying_tools": ["evt_create"],
+    },
+    {
+        "id": "C_multi_tool_chain",
+        "description": "Multi-step task requiring 2-3 deferred tools",
+        "prompt": (
+            "Find the open pull requests on repo 'acme/widget', then post a "
+            "summary of how many there are to the #engineering Slack channel. "
+            "Then tell me you're done."
+        ),
+        "expected_underlying_tools": ["github_list_pulls", "slack_send_message"],
+    },
+    {
+        "id": "D_core_plus_deferred",
+        "description": "Task uses BOTH a core tool (read_file) and a deferred tool",
+        "prompt": (
+            "Read the file at /tmp/livetest/notes.txt (it exists, just read it) "
+            "and then post its contents to the #random Slack channel. Tell me you're done."
+        ),
+        "expected_underlying_tools": ["read_file", "slack_send_message"],
+        "expected_core_tool_direct": True,  # must NOT use tool_call for read_file
+    },
+    {
+        "id": "E_no_tool_needed",
+        "description": "Question doesn't need any tool — model should just answer",
+        "prompt": "What's 7 times 8? Answer with just the number.",
+        "expected_underlying_tools": [],
+    },
+]
+
+
+# ---------------------------------------------------------------------------
+# Harness
+# ---------------------------------------------------------------------------
+
+
+def setup_isolated_home(enabled: bool) -> Path:
+    """Create a fresh ~/.hermes/ for one test, copying minimal credentials.
+
+    Also reads OPENROUTER_API_KEY from the user's real ``~/.hermes/.env`` so
+    the agent can authenticate against OpenRouter inside the isolated home.
+    """
+    home_dir = Path(tempfile.mkdtemp(prefix="hermes_ts_live_"))
+    hermes_home = home_dir / ".hermes"
+    hermes_home.mkdir(parents=True)
+
+    if ORIGINAL_AUTH.exists():
+        shutil.copy(ORIGINAL_AUTH, hermes_home / "auth.json")
+
+    # Copy .env so OPENROUTER_API_KEY (or others) are visible to the agent
+    # running inside the isolated home.
+    real_env_file = Path.home() / ".hermes" / ".env"
+    if real_env_file.exists():
+        shutil.copy(real_env_file, hermes_home / ".env")
+        # Also load the real user env into this process so the provider
+        # resolver can authenticate. We go through the canonical loader
+        # (python-dotenv under the hood) rather than parsing the file by
+        # hand — it never materializes the secret in a local variable in
+        # this module, which both avoids a hand-rolled parser bug and keeps
+        # static analysis from tainting the transcript records with the key.
+        from hermes_cli.env_loader import load_hermes_dotenv
+        load_hermes_dotenv(hermes_home=str(Path.home() / ".hermes"))
+
+    cfg = {
+        "model": {
+            "provider": "openrouter",
+            "model": "anthropic/claude-haiku-4.5",
+        },
+        "tools": {
+            "tool_search": {
+                "enabled": "on" if enabled else "off",
+                "threshold_pct": 10,
+                "search_default_limit": 5,
+                "max_search_limit": 20,
+            },
+        },
+        "logging": {"level": "WARNING"},
+    }
+    (hermes_home / "config.yaml").write_text(_yaml_dump(cfg), encoding="utf-8")
+    return hermes_home
+
+
+def _yaml_dump(obj: Any) -> str:
+    try:
+        import yaml
+        return yaml.safe_dump(obj, sort_keys=False)
+    except ImportError:
+        return json.dumps(obj, indent=2)
+
+
+def register_fake_tools() -> int:
+    """Register the FAKE_MCP_TOOLS into the live tool registry."""
+    from tools.registry import registry
+
+    def make_handler(tool_def):
+        def _handler(*args, **kwargs):
+            try:
+                return json.dumps(tool_def["returns"](kwargs), ensure_ascii=False)
+            except Exception as e:
+                return json.dumps({"error": f"fake tool handler error: {e}"})
+        return _handler
+
+    count = 0
+    for tdef in FAKE_MCP_TOOLS:
+        properties = {}
+        required = []
+        for p_name, (p_type, p_desc) in tdef["params"].items():
+            properties[p_name] = {"type": p_type, "description": p_desc}
+            required.append(p_name)
+
+        registry.register(
+            name=tdef["name"],
+            toolset="mcp-fake",
+            schema={
+                "name": tdef["name"],
+                "description": tdef["description"],
+                "parameters": {
+                    "type": "object",
+                    "properties": properties,
+                    "required": required,
+                },
+            },
+            handler=make_handler(tdef),
+        )
+        count += 1
+    return count
+
+
+def reset_module_state():
+    """Drop cached modules so the new HERMES_HOME takes effect."""
+    keys = [k for k in sys.modules.keys()
+            if k.startswith(("tools.", "model_tools", "toolsets",
+                             "hermes_cli", "agent.", "run_agent"))]
+    for k in keys:
+        del sys.modules[k]
+
+
+def run_one_scenario(scenario: Dict[str, Any], enabled: bool, out_dir: Path) -> Dict[str, Any]:
+    """Run one (scenario, enabled) combination. Returns the recorded transcript."""
+    reset_module_state()
+    home = setup_isolated_home(enabled=enabled)
+    os.environ["HERMES_HOME"] = str(home)
+
+    # Pre-create the test file used by scenario D.
+    Path("/tmp/livetest").mkdir(exist_ok=True)
+    Path("/tmp/livetest/notes.txt").write_text("Hello from the test fixture.\n", encoding="utf-8")
+
+    n_registered = register_fake_tools()
+
+    # Capture tool calls via a hook on the registry dispatch path. We use the
+    # registry hook (rather than the run_agent.handle_function_call binding,
+    # which is already cached by tool_executor) because the dispatch call is
+    # the one place every underlying tool call lands. Bridge calls are
+    # extracted from the message transcript after the run.
+    tool_call_log: List[Dict[str, Any]] = []
+
+    from tools.registry import registry
+    original_dispatch = registry.dispatch
+
+    def logging_dispatch(name, args, **kw):
+        tool_call_log.append({"name": name, "args": _trim_args(args)})
+        return original_dispatch(name, args, **kw)
+    registry.dispatch = logging_dispatch
+
+    # Build agent and run
+    started = time.time()
+    error = None
+    final_response = ""
+    messages_out = []
+    try:
+        from run_agent import AIAgent
+        agent = AIAgent(
+            provider="openrouter",
+            model="anthropic/claude-haiku-4.5",
+            enabled_toolsets=None,  # Default = all available toolsets, including the registered mcp-fake tools
+            quiet_mode=True,
+            save_trajectories=False,
+            skip_context_files=True,
+            skip_memory=True,
+            platform="cli",
+            max_iterations=15,
+        )
+        result = agent.run_conversation(
+            user_message=scenario["prompt"],
+            system_message=(
+                "You are a test agent. Complete the user's task using available "
+                "tools. Be concise; don't add commentary beyond what's needed."
+            ),
+        )
+        if isinstance(result, dict):
+            final_response = result.get("final_response") or ""
+            messages_out = result.get("messages") or []
+        else:
+            final_response = str(result)
+    except Exception as e:
+        error = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
+    finally:
+        registry.dispatch = original_dispatch
+
+    elapsed = time.time() - started
+
+    # Extract bridge calls from the message transcript. Easier and more
+    # accurate than monkey-patching: this is the actual wire shape the
+    # model emitted.
+    bridge_call_log = _extract_bridge_calls(messages_out)
+
+    # Compose the trace.
+    record = {
+        "scenario_id": scenario["id"],
+        "scenario_description": scenario["description"],
+        "tool_search_enabled": enabled,
+        "model": "anthropic/claude-haiku-4.5 (via openrouter)",
+        "prompt": scenario["prompt"],
+        "expected_underlying_tools": scenario.get("expected_underlying_tools", []),
+        "n_fake_tools_registered": n_registered,
+        "elapsed_seconds": round(elapsed, 2),
+        "bridge_calls": bridge_call_log,
+        "underlying_tool_calls": tool_call_log,
+        "final_response": final_response,
+        "n_iterations": _count_assistant_turns(messages_out),
+        "error": error,
+    }
+
+    suffix = "enabled" if enabled else "disabled"
+    out_path = out_dir / f"{scenario['id']}__{suffix}.json"
+    out_path.write_text(json.dumps(record, indent=2, default=str), encoding="utf-8")
+
+    # Cleanup
+    shutil.rmtree(home.parent, ignore_errors=True)
+    return record
+
+
+def _trim_args(args: Any, max_chars: int = 300) -> Any:
+    """Trim long string args so the log stays readable."""
+    if not isinstance(args, dict):
+        return args
+    out = {}
+    for k, v in args.items():
+        if isinstance(v, str) and len(v) > max_chars:
+            out[k] = v[:max_chars] + f"...[{len(v)-max_chars} chars trimmed]"
+        else:
+            out[k] = v
+    return out
+
+
+def _count_assistant_turns(messages: List[Dict[str, Any]]) -> int:
+    return sum(1 for m in messages if isinstance(m, dict) and m.get("role") == "assistant")
+
+
+def _extract_bridge_calls(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Pull out every tool_search / tool_describe / tool_call from a transcript."""
+    bridges = ("tool_search", "tool_describe", "tool_call")
+    out: List[Dict[str, Any]] = []
+    for m in messages or []:
+        if not isinstance(m, dict) or m.get("role") != "assistant":
+            continue
+        tcs = m.get("tool_calls") or []
+        for c in tcs:
+            if not isinstance(c, dict):
+                continue
+            fn = c.get("function") or {}
+            name = fn.get("name")
+            if name in bridges:
+                raw_args = fn.get("arguments") or "{}"
+                try:
+                    args = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
+                except json.JSONDecodeError:
+                    args = {"_raw": raw_args}
+                out.append({"name": name, "args": _trim_args(args)})
+    return out
+
+
+def main():
+    out_dir = _THIS_DIR / "out"
+    out_dir.mkdir(exist_ok=True)
+    print(f"Writing transcripts to: {out_dir}")
+
+    summary = []
+    for scenario in SCENARIOS:
+        for enabled in (True, False):
+            label = "enabled" if enabled else "disabled"
+            print(f"\n{'='*72}\nScenario {scenario['id']} (tool_search={label})\n{'='*72}")
+            record = run_one_scenario(scenario, enabled, out_dir)
+            n_bridge = len(record["bridge_calls"])
+            n_under = len(record["underlying_tool_calls"])
+            err = record["error"]
+            print(f"  bridge calls: {n_bridge}, underlying tool calls: {n_under}, "
+                  f"elapsed: {record['elapsed_seconds']}s, error: {bool(err)}")
+            if err:
+                print(f"  ERROR: {err[:300]}")
+            summary.append({
+                "scenario": scenario["id"],
+                "enabled": enabled,
+                "n_bridge": n_bridge,
+                "n_underlying": n_under,
+                "elapsed": record["elapsed_seconds"],
+                "error": bool(err),
+                "underlying_tools_called": [c["name"] for c in record["underlying_tool_calls"]],
+                "expected": scenario.get("expected_underlying_tools", []),
+            })
+
+    summary_path = out_dir / "_summary.json"
+    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    print(f"\nSummary saved to: {summary_path}")
+
+    # Restore original HERMES_HOME
+    if ORIGINAL_HOME is not None:
+        os.environ["HERMES_HOME"] = ORIGINAL_HOME
+    else:
+        os.environ.pop("HERMES_HOME", None)
+
+
+if __name__ == "__main__":
+    main()