Provide example datagen config scripts

2026-02-25 02:27:41 -08:00
parent 9166d56f17
commit 9ec4f7504b
17 changed files with 71 additions and 375 deletions
--- a/datagen-config-examples/example_browser_tasks.jsonl
+++ b/datagen-config-examples/example_browser_tasks.jsonl
@ -0,0 +1,5 @@
+{"prompt": "Go to https://news.ycombinator.com and find the top 5 posts on the front page. For each post, get the title, URL, points, and number of comments. Return the results as a formatted summary."}
+{"prompt": "Navigate to https://en.wikipedia.org/wiki/Hermes and extract the first paragraph of the article, the image caption, and the list of items in the infobox. Summarize what you find."}
+{"prompt": "Go to https://github.com/trending and find the top 3 trending repositories today. For each repo, get the name, description, language, and star count. Write the results to a file called trending_repos.md."}
+{"prompt": "Visit https://httpbin.org/forms/post and fill out the form with sample data (customer name: Jane Doe, size: Medium, topping: Bacon, delivery time: 12:00). Submit the form and report what the response page shows."}
+{"prompt": "Navigate to https://books.toscrape.com, browse to the Travel category, find the highest-rated book, and extract its title, price, availability, and description."}
--- a/datagen-config-examples/run_browser_tasks.sh
+++ b/datagen-config-examples/run_browser_tasks.sh
@ -0,0 +1,65 @@
+#!/bin/bash
+
+# =============================================================================
+# Example: Browser-Focused Data Generation
+# =============================================================================
+#
+# Generates tool-calling trajectories for browser automation tasks.
+# The agent navigates websites, fills forms, extracts information, etc.
+#
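+# Distribution (per-toolset inclusion probability; the values are independent
+# and are not expected to sum to 100%):
+#   browser 97%, web 20%, vision 12%, terminal 15%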
+#
+# Prerequisites:
+#   - OPENROUTER_API_KEY in ~/.hermes/.env
+#   - BROWSERBASE_API_KEY in ~/.hermes/.env (for browser tools)
+#   - A dataset JSONL file with one {"prompt": "..."} per line
+#
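+# Usage (run from the repository root; batch_runner.py is invoked relative to
+# the current directory and logs/ is created there):
+#   cd ~/.hermes/hermes-agent
+#   bash datagen-config-examples/run_browser_tasks.sh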
+#
+# Output: data/browser_tasks_example/trajectories.jsonl
+# =============================================================================
+
+mkdir -p logs
+
+LOG_FILE="logs/browser_tasks_$(date +%Y%m%d_%H%M%S).log"
+echo "📝 Logging to: $LOG_FILE"
+
+# Point to the example dataset in this directory
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+python batch_runner.py \
+  --dataset_file="$SCRIPT_DIR/example_browser_tasks.jsonl" \
+  --batch_size=5 \
+  --run_name="browser_tasks_example" \
+  --distribution="browser_tasks" \
+  --model="anthropic/claude-sonnet-4" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --num_workers=3 \
+  --max_turns=30 \
+  --ephemeral_system_prompt="You are an AI assistant with browser automation capabilities. Your primary task is to navigate and interact with web pages to accomplish user goals.
+
+IMPORTANT GUIDELINES:
+
+1. SEARCHING: Do NOT search directly on Google via the browser — they block automated searches. Use the web_search tool first to find URLs, then navigate to them with browser tools.
+
+2. COOKIE/PRIVACY DIALOGS: After navigating to a page, check for cookie consent or privacy popups. Dismiss them by clicking Accept/Close/OK before interacting with other elements. Take a fresh browser_snapshot afterward.
+
+3. HANDLING TIMEOUTS: If an action times out, the element may be blocked by an overlay. Take a new snapshot and look for dialogs to dismiss. If none, try an alternative approach or report the issue.
+
+4. GENERAL: Use browser tools to click, fill forms, and extract information. Use terminal for local file operations. Verify your actions and handle errors gracefully." \
+  2>&1 | tee "$LOG_FILE"
+
+echo "✅ Done. Log: $LOG_FILE"
+
+# =============================================================================
+# Common options you can add:
+#
+#   --resume                  Resume from checkpoint if interrupted
+#   --verbose                 Enable detailed logging
+#   --max_tokens=63000        Set max response tokens
+#   --reasoning_disabled      Disable model thinking/reasoning tokens
+#   --providers_allowed="anthropic,google"  Restrict to specific providers
+#   --prefill_messages_file="configs/prefill.json"  Few-shot priming
+# =============================================================================
--- a/datagen-config-examples/trajectory_compression.yaml
+++ b/datagen-config-examples/trajectory_compression.yaml
@ -0,0 +1,101 @@
+# Trajectory Compression Configuration
+# 
+# Post-processes completed agent trajectories to fit within a target token budget.
+# Compression preserves head/tail turns and summarizes middle content only as needed.
+
+# Tokenizer settings for accurate token counting
+tokenizer:
+  # HuggingFace tokenizer name
+  name: "moonshotai/Kimi-K2-Thinking"
+  
+  # Trust remote code (required for some tokenizers)
+  trust_remote_code: true
+
+# Compression targets and behavior
+compression:
+  # Target maximum tokens for compressed trajectory
+  target_max_tokens: 29000
+  
+  # Target size for summary (in tokens)
+  # This is factored into calculations when determining what to compress
+  summary_target_tokens: 750
+
+# Protected turns that should NEVER be compressed
+protected_turns:
+  # Always protect the first system message (tool definitions)
+  first_system: true
+  
+  # Always protect the first human message (original request)
+  first_human: true
+  
+  # Always protect the first gpt message (initial response/tool_call)
+  first_gpt: true
+  
+  # Always protect the first tool response (result of first action)
+  first_tool: true
+  
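+  # Always protect the last N turns of the trajectory. The value counts
+  # individual turns, so 4 covers the last 2 complete gpt+tool pairs
+  # (or more gpt turns when some have no tool response).
+  # This ensures the model's final actions and conclusions are preserved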
+  last_n_turns: 4
+
+# LLM settings for generating summaries (OpenRouter only)
+summarization:
+  # Model to use for summarization (should be fast and cheap)
+  # Using OpenRouter model path format
+  model: "google/gemini-3-flash-preview"
+  
+  # OpenRouter API settings
+  base_url: "https://openrouter.ai/api/v1"
+  
+  # Environment variable containing OpenRouter API key
+  api_key_env: "OPENROUTER_API_KEY"
+  
+  # Temperature for summarization (lower = more deterministic)
+  temperature: 0.3
+  
+  # Max retries for API failures
+  max_retries: 3
+  
+  # Delay between retries (seconds)
+  retry_delay: 2
+
+# Output settings
+output:
+  # Add notice to system message about potential summarization
+  add_summary_notice: true
+  
+  # Text to append to system message
+  summary_notice_text: "\n\nSome of the conversation may be summarized to preserve context."
+  
+  # Output directory suffix (appended to input directory name)
+  output_suffix: "_compressed"
+
+# Processing settings
+processing:
+  # Number of parallel workers for batch processing
+  num_workers: 4
+  
+  # Maximum concurrent API calls for summarization (async parallelism)
+  max_concurrent_requests: 50
+  
+  # Skip trajectories that are already under target length
+  skip_under_target: true
+  
+  # If true, save trajectories even if compression can't get under target
+  # (will compress as much as possible)
+  save_over_limit: true
+  
+  # Timeout per trajectory in seconds (skip if takes longer)
+  # Helps avoid hanging on problematic entries
+  per_trajectory_timeout: 300  # 5 minutes
+
+# Metrics to track
+metrics:
+  # Log detailed compression statistics
+  enabled: true
+  
+  # Save per-trajectory metrics in output
+  per_trajectory: false
+  
+  # Metrics file name (saved in output directory)
+  output_file: "compression_metrics.json"