diff --git a/.gitignore b/.gitignore index d7a2c67c1..80984656b 100644 --- a/.gitignore +++ b/.gitignore @@ -92,3 +92,7 @@ docs/superpowers/* # also created in-repo when an agent operates in this checkout). Plans, audit # logs, and per-session caches are never artifacts of the codebase. .hermes/ + +# Tool Search live-test harness output — non-deterministic model transcripts, +# regenerated by scripts/tool_search_livetest.py. Never an artifact of the repo. +scripts/out/ diff --git a/scripts/LIVETEST_README.md b/scripts/LIVETEST_README.md new file mode 100644 index 000000000..332d5509b --- /dev/null +++ b/scripts/LIVETEST_README.md @@ -0,0 +1,45 @@ +# Tool Search live test harness + +Runs five scenarios against a real model (Claude Haiku 4.5 via OpenRouter) to +verify that the bridge tools work end-to-end. Records transcripts in +`scripts/out/`. + +## Running + +```bash +cd path/to/repo-root # the directory that contains scripts/ +python3 scripts/tool_search_livetest.py # runs all 5 scenarios x 2 modes +python3 scripts/analyze_livetest.py # side-by-side report +``` + +Requires `OPENROUTER_API_KEY` set or present in `~/.hermes/.env`. + +## What it verifies + +| Scenario | Tests | +|----------|-------| +| A obvious_single | BM25 retrieval on an obvious tool name (github_create_issue) | +| B vague_paraphrased | Retrieval when the model has to paraphrase ("schedule meeting" → evt_create) | +| C multi_tool_chain | Multi-step task chaining two deferred tools (GitHub + Slack) | +| D core_plus_deferred | Mixed: core tool (read_file) called directly, deferred tool (Slack) via bridge | +| E no_tool_needed | Pure-knowledge prompt; verify no spurious tool_search invocations | + +Each scenario runs with `tool_search.enabled = on` and again with `off` for an +A/B baseline. 
The harness records: + +- bridge_calls (the tool_search / tool_describe / tool_call sequence the model emitted) +- underlying_tool_calls (what actually ran through the registry dispatcher) +- final_response, iteration count, elapsed time, any errors + +## Output structure + +``` +scripts/out/ + {scenario_id}__enabled.json # tool_search ON + {scenario_id}__disabled.json # tool_search OFF + _summary.json # one summary entry per (scenario, mode) run +``` + +The 2026-05 baseline run is checked in for reference. Re-running may produce +slightly different transcripts (the model is non-deterministic) but the +expected_underlying_tools assertions should remain satisfied. diff --git a/scripts/analyze_livetest.py b/scripts/analyze_livetest.py new file mode 100644 index 000000000..f11dae197 --- /dev/null +++ b/scripts/analyze_livetest.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +"""Compare enabled vs disabled runs and produce a readable report. + +Reads scripts/out/_summary.json and the per-scenario JSONs, prints a side-by- +side comparison of what happened, and flags anomalies. 
HERE = Path(__file__).resolve().parent
OUT = HERE / "out"

# Tool names left out of the "extra tools called" note printed by main().
# NOTE(review): presumably these are the always-loaded core tools — confirm.
_IGNORED_EXTRA_TOOLS = frozenset(
    {"read_file", "search_files", "terminal", "todo", "memory"}
)


def load_record(scenario_id: str, mode: str):
    """Load the saved record for one (scenario, mode) run.

    Args:
        scenario_id: Scenario identifier used as the file-name prefix.
        mode: File-name suffix; ``main`` passes "enabled" or "disabled".

    Returns:
        The parsed JSON content of ``OUT / "{scenario_id}__{mode}.json"``,
        or ``None`` when that file does not exist.
    """
    path = OUT / f"{scenario_id}__{mode}.json"
    if not path.exists():
        return None
    return json.loads(path.read_text(encoding="utf-8"))


def fmt_tool_seq(calls):
    """Render a list of call records as ``name → name → ...``.

    Returns "(none)" for an empty or missing list.
    """
    if not calls:
        return "(none)"
    return " → ".join(c["name"] for c in calls)


def _bridge_args(call):
    """Return the call's ``args`` when it is a dict, otherwise an empty dict."""
    args = call.get("args")
    return args if isinstance(args, dict) else {}


def fmt_bridge_seq(calls):
    """Render bridge calls as a compact arrow-separated sequence.

    ``tool_call`` entries show the inner tool name, ``tool_search`` entries
    show the first 30 characters of the query, and ``tool_describe`` entries
    show the described name. Entries with any other name are omitted.
    Returns "(none)" for an empty or missing list.

    ``args`` that are not a dict (for example a JSON list decoded from the
    transcript) are treated as empty, and a non-string query is stringified,
    so a malformed record cannot crash the report.
    """
    if not calls:
        return "(none)"
    parts = []
    for c in calls:
        name = c["name"]
        args = _bridge_args(c)
        if name == "tool_call":
            parts.append(f"tool_call→{args.get('name', '?')}")
        elif name == "tool_search":
            query = str(args.get("query", "?"))
            parts.append(f"search('{query[:30]}')")
        elif name == "tool_describe":
            parts.append(f"describe({args.get('name', '?')})")
    return " → ".join(parts)


def main():
    """Print a side-by-side report of the enabled and disabled runs.

    Exits with status 1 when the output directory or ``_summary.json`` is
    missing. A scenario lacking either record is skipped and listed at the
    end; the final failure ratio counts only the runs that were checked.
    """
    if not OUT.exists():
        print("No output directory at", OUT)
        sys.exit(1)
    summary_path = OUT / "_summary.json"
    if not summary_path.exists():
        print("No _summary.json yet")
        sys.exit(1)

    summary = json.loads(summary_path.read_text(encoding="utf-8"))
    scenarios = sorted({row["scenario"] for row in summary})

    print(f"{'='*78}")
    print(" Live test results: tool_search ENABLED vs DISABLED")
    print(f"{'='*78}\n")

    fails = 0
    checked = 0
    skipped = []
    for sid in scenarios:
        en = load_record(sid, "enabled")
        di = load_record(sid, "disabled")
        if not en or not di:
            # Remember the gap instead of letting it pass as a silent success.
            skipped.append(sid)
            continue
        expected = set(en["expected_underlying_tools"])

        print(f"┌─ {sid} ({en['scenario_description']})")
        print(f"│ Prompt: {en['prompt'][:120]}")
        print(f"│ Expected underlying tools: {sorted(expected) or '(none)'}")
        print("│")

        for label, rec in [("ENABLED ", en), ("DISABLED", di)]:
            called_set = {c["name"] for c in rec["underlying_tool_calls"]}
            missing = expected - called_set
            extra = called_set - expected - _IGNORED_EXTRA_TOOLS

            # A run passes when every expected tool ran and no error was recorded.
            passed = not missing and not rec["error"]
            checked += 1
            if not passed:
                fails += 1
            mark = "✓" if passed else "✗"

            print(f"│ {label} {mark} bridges={len(rec['bridge_calls']):2} underlying={len(rec['underlying_tool_calls']):2} "
                  f"iters={rec['n_iterations']:2} elapsed={rec['elapsed_seconds']:5.1f}s err={bool(rec['error'])}")
            print(f"│ underlying: {fmt_tool_seq(rec['underlying_tool_calls'])}")
            if rec["bridge_calls"]:
                print(f"│ bridges: {fmt_bridge_seq(rec['bridge_calls'])}")
            if missing:
                print(f"│ ⚠ MISSING expected tools: {sorted(missing)}")
            if extra:
                print(f"│ ⓘ extra tools called: {sorted(extra)}")
            if rec["error"]:
                print(f"│ 💥 error: {rec['error'][:200]}")

        # Bridge-trip count vs direct (interesting comparator)
        en_bridges = len(en["bridge_calls"])
        di_underlying = len(di["underlying_tool_calls"])
        en_underlying = len(en["underlying_tool_calls"])
        overhead = en_bridges + en_underlying - di_underlying
        # ":+d" emits the sign itself, so a negative delta is not shown as "+-N".
        print(f"│ Δ round-trip cost: enabled used {en_bridges + en_underlying} calls vs disabled {di_underlying} → {overhead:+d}")
        print(f"│ Final (enabled): {(en.get('final_response') or '')[:140]}")
        print(f"│ Final (disabled): {(di.get('final_response') or '')[:140]}")
        print("└──")
        print()

    print(f"\nFails: {fails}/{checked}")
    if skipped:
        print(f"Skipped (missing enabled/disabled record): {', '.join(skipped)}")


if __name__ == "__main__":
    main()
+ +For each scenario we record: + - the full message transcript + - the sequence of tool calls (name + args) the model emitted + - which underlying tools actually got invoked (after bridge unwrap) + - the final assistant response + - timing and round-trip count + +Each scenario runs twice: + - tool_search ENABLED (deferred behind bridges) + - tool_search DISABLED (all tools loaded directly) + +Output: ./out/__.json +""" + +from __future__ import annotations + +import json +import os +import shutil +import sys +import tempfile +import time +import traceback +from pathlib import Path +from typing import Any, Dict, List, Tuple + +# Force-isolate the test environment BEFORE any hermes imports. +ORIGINAL_HOME = os.environ.get("HERMES_HOME") +ORIGINAL_AUTH = Path.home() / ".hermes" / "auth.json" + +_THIS_DIR = Path(__file__).resolve().parent +_WORKTREE_ROOT = _THIS_DIR.parent +sys.path.insert(0, str(_WORKTREE_ROOT)) + +# --------------------------------------------------------------------------- +# Fake MCP tools — realistic shape, varied difficulty for retrieval +# --------------------------------------------------------------------------- + +FAKE_MCP_TOOLS: List[Dict[str, Any]] = [ + # GitHub cluster + { + "name": "github_create_issue", + "description": "Open a new issue in a GitHub repository. Use when the user wants to report a bug or request a feature in a repo.", + "params": {"repo": ("string", "Repository in owner/name form"), + "title": ("string", "Issue title"), + "body": ("string", "Issue body in Markdown")}, + "returns": lambda args: {"ok": True, "issue_url": f"https://github.com/{args.get('repo','x/y')}/issues/42"}, + }, + { + "name": "github_search_repos", + "description": "Search GitHub repositories by free-text query. 
Returns a ranked list of repo names with star counts.", + "params": {"query": ("string", "Search terms"), + "limit": ("integer", "Max results")}, + "returns": lambda args: {"results": [{"name": "fake/repo-1", "stars": 1200}, + {"name": "fake/repo-2", "stars": 540}]}, + }, + { + "name": "github_close_pr", + "description": "Close a pull request without merging it. Use when the PR should be abandoned.", + "params": {"repo": ("string", ""), "pr_number": ("integer", "")}, + "returns": lambda args: {"ok": True, "state": "closed"}, + }, + { + "name": "github_list_pulls", + "description": "List open pull requests for a repository.", + "params": {"repo": ("string", "")}, + "returns": lambda args: {"pulls": [{"number": 31163, "title": "feat(tools): tool search"}]}, + }, + + # Slack cluster + { + "name": "slack_send_message", + "description": "Post a message into a Slack channel as the connected workspace's app.", + "params": {"channel": ("string", "Channel name with leading #"), + "text": ("string", "Message body")}, + "returns": lambda args: {"ok": True, "ts": "1716528000.000100"}, + }, + { + "name": "slack_list_channels", + "description": "Return all channels visible to the connected Slack workspace bot.", + "params": {}, + "returns": lambda args: {"channels": ["#general", "#engineering", "#random"]}, + }, + { + "name": "slack_set_status", + "description": "Set the current user's Slack status (emoji + text).", + "params": {"emoji": ("string", ""), "text": ("string", "")}, + "returns": lambda args: {"ok": True}, + }, + + # Calendar cluster (intentionally vague names to stress retrieval) + { + "name": "evt_create", + "description": "Add an event to the connected calendar. 
Used for scheduling meetings.", + "params": {"title": ("string", ""), + "start": ("string", "ISO 8601 datetime"), + "duration_min": ("integer", "")}, + "returns": lambda args: {"ok": True, "event_id": "evt_abc"}, + }, + { + "name": "evt_list", + "description": "List upcoming calendar events.", + "params": {"max_results": ("integer", "")}, + "returns": lambda args: {"events": [{"id": "evt_1", "title": "Standup", "start": "2026-05-25T09:00:00Z"}]}, + }, + + # Knowledge / docs (paraphrased name to stress retrieval) + { + "name": "docsearch_query", + "description": "Search the user's internal documentation index for matching pages.", + "params": {"q": ("string", "Search query"), "limit": ("integer", "")}, + "returns": lambda args: {"hits": [{"title": "Onboarding", "url": "https://docs/x"}]}, + }, + { + "name": "docsearch_fetch", + "description": "Fetch the full markdown content of one document by ID.", + "params": {"id": ("string", "")}, + "returns": lambda args: {"content": "# Onboarding\n..."}, + }, + + # Database + { + "name": "db_query", + "description": "Run a read-only SQL query against the analytics database.", + "params": {"sql": ("string", "SELECT ... 
statement")}, + "returns": lambda args: {"rows": [{"id": 1, "name": "alice"}]}, + }, + { + "name": "db_describe_table", + "description": "Show the schema of a database table.", + "params": {"table": ("string", "")}, + "returns": lambda args: {"columns": [{"name": "id", "type": "int"}, {"name": "name", "type": "text"}]}, + }, + + # Linear + { + "name": "linear_create_ticket", + "description": "Create a new Linear issue (ticket) in the connected workspace.", + "params": {"title": ("string", ""), "body": ("string", ""), "priority": ("integer", "1-4")}, + "returns": lambda args: {"ok": True, "id": "ENG-101"}, + }, + { + "name": "linear_assign", + "description": "Reassign a Linear ticket to a different user.", + "params": {"ticket_id": ("string", ""), "user": ("string", "")}, + "returns": lambda args: {"ok": True}, + }, + + # Notion + { + "name": "notion_create_page", + "description": "Create a new page in the connected Notion workspace.", + "params": {"title": ("string", ""), "body": ("string", ""), "parent": ("string", "")}, + "returns": lambda args: {"ok": True, "page_id": "abc123"}, + }, + + # Random others (filler / distractors) + { + "name": "weather_get", + "description": "Look up the current weather for a city.", + "params": {"city": ("string", "")}, + "returns": lambda args: {"city": args.get("city", ""), "temp_c": 19, "summary": "Cloudy"}, + }, + { + "name": "translate_text", + "description": "Translate a short text from one language to another.", + "params": {"text": ("string", ""), "to": ("string", "Target language code")}, + "returns": lambda args: {"translated": args.get("text", "") + " [translated to " + args.get("to", "??") + "]"}, + }, + { + "name": "pdf_extract", + "description": "Extract text from a PDF file given its path.", + "params": {"path": ("string", "")}, + "returns": lambda args: {"text": "[fake PDF text]"}, + }, + { + "name": "yt_transcript", + "description": "Fetch the transcript for a YouTube video by URL.", + "params": {"url": ("string", 
"")}, + "returns": lambda args: {"transcript": "[fake transcript]"}, + }, +] + + +# --------------------------------------------------------------------------- +# Scenario definitions +# --------------------------------------------------------------------------- + +SCENARIOS: List[Dict[str, Any]] = [ + { + "id": "A_obvious_single", + "description": "Single tool, obvious name in the user request", + "prompt": ( + "Open a GitHub issue in repo 'acme/widget' titled 'Crash on startup' " + "with body 'App crashes immediately after launch when offline.' " + "Then tell me you're done. Don't do anything else." + ), + "expected_underlying_tools": ["github_create_issue"], + }, + { + "id": "B_vague_paraphrased", + "description": "Single tool, paraphrased intent (tests retrieval quality)", + "prompt": ( + "Add a meeting to my schedule for tomorrow morning at 10am called " + "'Design review', 30 minutes long. Then tell me you're done. Don't do anything else." + ), + "expected_underlying_tools": ["evt_create"], + }, + { + "id": "C_multi_tool_chain", + "description": "Multi-step task requiring 2-3 deferred tools", + "prompt": ( + "Find the open pull requests on repo 'acme/widget', then post a " + "summary of how many there are to the #engineering Slack channel. " + "Then tell me you're done." + ), + "expected_underlying_tools": ["github_list_pulls", "slack_send_message"], + }, + { + "id": "D_core_plus_deferred", + "description": "Task uses BOTH a core tool (read_file) and a deferred tool", + "prompt": ( + "Read the file at /tmp/livetest/notes.txt (it exists, just read it) " + "and then post its contents to the #random Slack channel. Tell me you're done." + ), + "expected_underlying_tools": ["read_file", "slack_send_message"], + "expected_core_tool_direct": True, # must NOT use tool_call for read_file + }, + { + "id": "E_no_tool_needed", + "description": "Question doesn't need any tool — model should just answer", + "prompt": "What's 7 times 8? 
def setup_isolated_home(enabled: bool) -> Path:
    """Create a fresh ``.hermes`` home in a temp directory for one test run.

    Copies ``auth.json`` and ``.env`` from the user's real ``~/.hermes`` when
    they exist, loads the real ``.env`` into this process through
    ``hermes_cli.env_loader.load_hermes_dotenv``, and writes a ``config.yaml``
    selecting the OpenRouter model and the requested tool_search mode.

    Args:
        enabled: Whether ``tools.tool_search.enabled`` is written as "on"
            (True) or "off" (False).

    Returns:
        Path of the new ``.hermes`` directory; its parent is the temp
        directory the caller is expected to delete afterwards.
    """
    home_dir = Path(tempfile.mkdtemp(prefix="hermes_ts_live_"))
    try:
        hermes_home = home_dir / ".hermes"
        hermes_home.mkdir(parents=True)

        if ORIGINAL_AUTH.exists():
            shutil.copy(ORIGINAL_AUTH, hermes_home / "auth.json")

        # Copy .env so OPENROUTER_API_KEY (or others) are visible to the agent
        # running inside the isolated home.
        real_env_file = Path.home() / ".hermes" / ".env"
        if real_env_file.exists():
            shutil.copy(real_env_file, hermes_home / ".env")
            # Also load the real user env into this process so the provider
            # resolver can authenticate. Going through the canonical loader
            # rather than parsing the file by hand keeps the secret out of
            # any local variable in this module.
            from hermes_cli.env_loader import load_hermes_dotenv
            load_hermes_dotenv(hermes_home=str(Path.home() / ".hermes"))

        cfg = {
            "model": {
                "provider": "openrouter",
                "model": "anthropic/claude-haiku-4.5",
            },
            "tools": {
                "tool_search": {
                    "enabled": "on" if enabled else "off",
                    "threshold_pct": 10,
                    "search_default_limit": 5,
                    "max_search_limit": 20,
                },
            },
            "logging": {"level": "WARNING"},
        }
        (hermes_home / "config.yaml").write_text(_yaml_dump(cfg), encoding="utf-8")
    except Exception:
        # Do not leave copied credentials behind in the temp directory when
        # setup fails part-way through.
        shutil.rmtree(home_dir, ignore_errors=True)
        raise
    return hermes_home


def _yaml_dump(obj: Any) -> str:
    """Serialize *obj* with ``yaml.safe_dump``; fall back to JSON text when
    the ``yaml`` module cannot be imported."""
    try:
        import yaml
    except ImportError:
        return json.dumps(obj, indent=2)
    return yaml.safe_dump(obj, sort_keys=False)


def _make_fake_handler(tool_def):
    """Build the handler for one fake tool definition.

    The handler returns the JSON-encoded result of ``tool_def["returns"]``.
    Tool arguments are taken from the first positional argument when it is a
    dict, otherwise from the keyword arguments. Any exception raised by the
    ``returns`` callable is reported as a JSON ``{"error": ...}`` payload
    instead of propagating.
    """
    def _handler(*args, **kwargs):
        # The dispatch hook in run_one_scenario passes the argument dict
        # positionally (dispatch(name, args, **kw)).
        # NOTE(review): assumes the registry forwards that dict to the handler
        # as its first positional argument — confirm against tools.registry.
        call_args = args[0] if args and isinstance(args[0], dict) else kwargs
        try:
            return json.dumps(tool_def["returns"](call_args), ensure_ascii=False)
        except Exception as e:
            return json.dumps({"error": f"fake tool handler error: {e}"})
    return _handler


def register_fake_tools() -> int:
    """Register every entry of FAKE_MCP_TOOLS into the live tool registry.

    Each tool is registered under the "mcp-fake" toolset with an object
    schema in which every declared parameter is listed as required.

    Returns:
        The number of tools registered.
    """
    from tools.registry import registry

    count = 0
    for tdef in FAKE_MCP_TOOLS:
        properties = {}
        required = []
        for p_name, (p_type, p_desc) in tdef["params"].items():
            properties[p_name] = {"type": p_type, "description": p_desc}
            required.append(p_name)

        registry.register(
            name=tdef["name"],
            toolset="mcp-fake",
            schema={
                "name": tdef["name"],
                "description": tdef["description"],
                "parameters": {
                    "type": "object",
                    "properties": properties,
                    "required": required,
                },
            },
            handler=_make_fake_handler(tdef),
        )
        count += 1
    return count


def reset_module_state():
    """Drop cached modules so the new HERMES_HOME takes effect."""
    prefixes = ("tools.", "model_tools", "toolsets",
                "hermes_cli", "agent.", "run_agent")
    # Snapshot the matching names first; sys.modules is mutated in the loop.
    for mod_name in [k for k in sys.modules if k.startswith(prefixes)]:
        del sys.modules[mod_name]


def run_one_scenario(scenario: Dict[str, Any], enabled: bool, out_dir: Path) -> Dict[str, Any]:
    """Run one (scenario, enabled) combination and return the recorded trace.

    Resets cached modules, builds an isolated home, registers the fake tools,
    hooks ``registry.dispatch`` to log underlying tool calls, runs the agent
    on the scenario prompt, and writes the record to
    ``out_dir / "{scenario id}__{enabled|disabled}.json"``.

    Exceptions raised while building or running the agent are captured in
    the record's ``error`` field rather than propagated. The temporary home
    is always removed, even when a later step raises.

    Args:
        scenario: Scenario dict providing ``id``, ``description``, ``prompt``
            and optionally ``expected_underlying_tools``.
        enabled: Whether tool_search is switched on for this run.
        out_dir: Directory that receives the JSON record.

    Returns:
        The record dict that was written to disk.
    """
    reset_module_state()
    home = setup_isolated_home(enabled=enabled)
    try:
        os.environ["HERMES_HOME"] = str(home)

        # Pre-create the test file used by scenario D.
        Path("/tmp/livetest").mkdir(exist_ok=True)
        Path("/tmp/livetest/notes.txt").write_text("Hello from the test fixture.\n", encoding="utf-8")

        n_registered = register_fake_tools()

        # Capture tool calls via a hook on the registry dispatch path, the one
        # place every underlying tool call lands. Bridge calls are extracted
        # from the message transcript after the run.
        tool_call_log: List[Dict[str, Any]] = []

        from tools.registry import registry
        original_dispatch = registry.dispatch

        def logging_dispatch(name, args, **kw):
            tool_call_log.append({"name": name, "args": _trim_args(args)})
            return original_dispatch(name, args, **kw)
        registry.dispatch = logging_dispatch

        # Build agent and run. perf_counter is monotonic, so the measured
        # duration is unaffected by wall-clock adjustments.
        started = time.perf_counter()
        error = None
        final_response = ""
        messages_out = []
        try:
            from run_agent import AIAgent
            agent = AIAgent(
                provider="openrouter",
                model="anthropic/claude-haiku-4.5",
                enabled_toolsets=None,  # Default = all available toolsets, including the registered mcp-fake tools
                quiet_mode=True,
                save_trajectories=False,
                skip_context_files=True,
                skip_memory=True,
                platform="cli",
                max_iterations=15,
            )
            result = agent.run_conversation(
                user_message=scenario["prompt"],
                system_message=(
                    "You are a test agent. Complete the user's task using available "
                    "tools. Be concise; don't add commentary beyond what's needed."
                ),
            )
            if isinstance(result, dict):
                final_response = result.get("final_response") or ""
                messages_out = result.get("messages") or []
            else:
                final_response = str(result)
        except Exception as e:
            error = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
        finally:
            registry.dispatch = original_dispatch

        elapsed = time.perf_counter() - started

        # Bridge calls come from the transcript: that is the actual wire
        # shape the model emitted.
        bridge_call_log = _extract_bridge_calls(messages_out)

        record = {
            "scenario_id": scenario["id"],
            "scenario_description": scenario["description"],
            "tool_search_enabled": enabled,
            "model": "anthropic/claude-haiku-4.5 (via openrouter)",
            "prompt": scenario["prompt"],
            "expected_underlying_tools": scenario.get("expected_underlying_tools", []),
            "n_fake_tools_registered": n_registered,
            "elapsed_seconds": round(elapsed, 2),
            "bridge_calls": bridge_call_log,
            "underlying_tool_calls": tool_call_log,
            "final_response": final_response,
            "n_iterations": _count_assistant_turns(messages_out),
            "error": error,
        }

        suffix = "enabled" if enabled else "disabled"
        out_path = out_dir / f"{scenario['id']}__{suffix}.json"
        out_path.write_text(json.dumps(record, indent=2, default=str), encoding="utf-8")
        return record
    finally:
        # The temp home holds copies of auth.json/.env; remove it on every
        # exit path, not only after a fully successful run.
        shutil.rmtree(home.parent, ignore_errors=True)


def _trim_args(args: Any, max_chars: int = 300) -> Any:
    """Trim long string values so the log stays readable.

    Non-dict input is returned unchanged. For a dict, a new dict is returned
    in which every string value longer than *max_chars* is cut to that length
    and suffixed with a note giving the number of characters removed.
    """
    if not isinstance(args, dict):
        return args
    out = {}
    for k, v in args.items():
        if isinstance(v, str) and len(v) > max_chars:
            out[k] = v[:max_chars] + f"...[{len(v)-max_chars} chars trimmed]"
        else:
            out[k] = v
    return out


def _count_assistant_turns(messages: List[Dict[str, Any]]) -> int:
    """Count the dict entries in *messages* whose ``role`` is "assistant"."""
    return sum(1 for m in messages if isinstance(m, dict) and m.get("role") == "assistant")


def _extract_bridge_calls(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Pull out every tool_search / tool_describe / tool_call from a transcript.

    Only assistant messages are inspected. String arguments are JSON-decoded;
    when decoding fails the raw string is kept under the ``_raw`` key. Each
    returned entry has the shape ``{"name": ..., "args": ...}`` with long
    string arguments trimmed.
    """
    bridges = ("tool_search", "tool_describe", "tool_call")
    out: List[Dict[str, Any]] = []
    for m in messages or []:
        if not isinstance(m, dict) or m.get("role") != "assistant":
            continue
        for c in m.get("tool_calls") or []:
            if not isinstance(c, dict):
                continue
            fn = c.get("function") or {}
            name = fn.get("name")
            if name not in bridges:
                continue
            raw_args = fn.get("arguments") or "{}"
            try:
                args = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
            except json.JSONDecodeError:
                args = {"_raw": raw_args}
            out.append({"name": name, "args": _trim_args(args)})
    return out


def main():
    """Run every scenario in both modes and write the per-run summary.

    Transcripts and ``_summary.json`` are written to the ``out`` directory
    next to this script. The HERMES_HOME value captured at import time is
    restored (or the variable removed) on exit, including when a scenario
    raises.
    """
    out_dir = _THIS_DIR / "out"
    out_dir.mkdir(exist_ok=True)
    print(f"Writing transcripts to: {out_dir}")

    summary = []
    try:
        for scenario in SCENARIOS:
            for enabled in (True, False):
                label = "enabled" if enabled else "disabled"
                print(f"\n{'='*72}\nScenario {scenario['id']} (tool_search={label})\n{'='*72}")
                record = run_one_scenario(scenario, enabled, out_dir)
                n_bridge = len(record["bridge_calls"])
                n_under = len(record["underlying_tool_calls"])
                err = record["error"]
                print(f" bridge calls: {n_bridge}, underlying tool calls: {n_under}, "
                      f"elapsed: {record['elapsed_seconds']}s, error: {bool(err)}")
                if err:
                    print(f" ERROR: {err[:300]}")
                summary.append({
                    "scenario": scenario["id"],
                    "enabled": enabled,
                    "n_bridge": n_bridge,
                    "n_underlying": n_under,
                    "elapsed": record["elapsed_seconds"],
                    "error": bool(err),
                    "underlying_tools_called": [c["name"] for c in record["underlying_tool_calls"]],
                    "expected": scenario.get("expected_underlying_tools", []),
                })

        summary_path = out_dir / "_summary.json"
        summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
        print(f"\nSummary saved to: {summary_path}")
    finally:
        # Restore original HERMES_HOME even if a scenario raised, so the
        # process is not left pointing at a deleted temp home.
        if ORIGINAL_HOME is not None:
            os.environ["HERMES_HOME"] = ORIGINAL_HOME
        else:
            os.environ.pop("HERMES_HOME", None)


if __name__ == "__main__":
    main()