diff --git a/agent/prompt_builder.py b/agent/prompt_builder.py
index fbb5f0fa0..80af3b64d 100644
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@@ -189,6 +189,46 @@ TOOL_USE_ENFORCEMENT_GUIDANCE = (
 # Add new patterns here when a model family needs explicit steering.
 TOOL_USE_ENFORCEMENT_MODELS = ("gpt", "codex", "gemini", "gemma")
 
+# OpenAI GPT/Codex-specific execution guidance. Addresses known failure modes
+# where GPT models abandon work on partial results, skip prerequisite lookups,
+# hallucinate instead of using tools, and declare "done" without verification.
+# Inspired by patterns from OpenAI's GPT-5.4 prompting guide & OpenClaw PR #38953.
+OPENAI_MODEL_EXECUTION_GUIDANCE = (
+    "# Execution discipline\n"
+    "<tool_persistence>\n"
+    "- Use tools whenever they improve correctness, completeness, or grounding.\n"
+    "- Do not stop early when another tool call would materially improve the result.\n"
+    "- If a tool returns empty or partial results, retry with a different query or "
+    "strategy before giving up.\n"
+    "- Keep calling tools until: (1) the task is complete, AND (2) you have verified "
+    "the result.\n"
+    "</tool_persistence>\n"
+    "\n"
+    "<prerequisite_checks>\n"
+    "- Before taking an action, check whether prerequisite discovery, lookup, or "
+    "context-gathering steps are needed.\n"
+    "- Do not skip prerequisite steps just because the final action seems obvious.\n"
+    "- If a task depends on output from a prior step, resolve that dependency first.\n"
+    "</prerequisite_checks>\n"
+    "\n"
+    "<verification>\n"
+    "Before finalizing your response:\n"
+    "- Correctness: does the output satisfy every stated requirement?\n"
+    "- Grounding: are factual claims backed by tool outputs or provided context?\n"
+    "- Formatting: does the output match the requested format or schema?\n"
+    "- Safety: if the next step has side effects (file writes, commands, API calls), "
+    "confirm scope before executing.\n"
+    "</verification>\n"
+    "\n"
+    "<missing_context>\n"
+    "- If required context is missing, do NOT guess or hallucinate an answer.\n"
+    "- Use the appropriate lookup tool when missing information is retrievable "
+    "(search_files, web_search, read_file, etc.).\n"
+    "- Ask a clarifying question only when the information cannot be retrieved by tools.\n"
+    "- If you must proceed with incomplete information, label assumptions explicitly.\n"
+    "</missing_context>"
+)
+
 # Gemini/Gemma-specific operational guidance, adapted from OpenCode's gemini.txt.
 # Injected alongside TOOL_USE_ENFORCEMENT_GUIDANCE when the model is Gemini or Gemma.
 GOOGLE_MODEL_OPERATIONAL_GUIDANCE = (
diff --git a/run_agent.py b/run_agent.py
index 619796c97..9aca26067 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -90,7 +90,7 @@ from agent.model_metadata import (
 from agent.context_compressor import ContextCompressor
 from agent.subdirectory_hints import SubdirectoryHintTracker
 from agent.prompt_caching import apply_anthropic_cache_control
-from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt, load_soul_md, TOOL_USE_ENFORCEMENT_GUIDANCE, TOOL_USE_ENFORCEMENT_MODELS, DEVELOPER_ROLE_MODELS, GOOGLE_MODEL_OPERATIONAL_GUIDANCE
+from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt, load_soul_md, TOOL_USE_ENFORCEMENT_GUIDANCE, TOOL_USE_ENFORCEMENT_MODELS, DEVELOPER_ROLE_MODELS, GOOGLE_MODEL_OPERATIONAL_GUIDANCE, OPENAI_MODEL_EXECUTION_GUIDANCE
 from agent.usage_pricing import estimate_usage_cost, normalize_usage
 from agent.display import (
     KawaiiSpinner, build_tool_preview as _build_tool_preview,
@@ -2791,11 +2791,15 @@ class AIAgent:
                 _inject = any(p in model_lower for p in TOOL_USE_ENFORCEMENT_MODELS)
             if _inject:
                 prompt_parts.append(TOOL_USE_ENFORCEMENT_GUIDANCE)
+                _model_lower = (self.model or "").lower()
                 # Google model operational guidance (conciseness, absolute
                 # paths, parallel tool calls, verify-before-edit, etc.)
-                _model_lower = (self.model or "").lower()
                 if "gemini" in _model_lower or "gemma" in _model_lower:
                     prompt_parts.append(GOOGLE_MODEL_OPERATIONAL_GUIDANCE)
+                # OpenAI GPT/Codex execution discipline (tool persistence,
+                # prerequisite checks, verification, anti-hallucination).
+                if "gpt" in _model_lower or "codex" in _model_lower:
+                    prompt_parts.append(OPENAI_MODEL_EXECUTION_GUIDANCE)
 
 
         # so it can refer the user to them rather than reinventing answers.
diff --git a/tests/agent/test_prompt_builder.py b/tests/agent/test_prompt_builder.py
index 791f7ea0e..ce8084709 100644
--- a/tests/agent/test_prompt_builder.py
+++ b/tests/agent/test_prompt_builder.py
@@ -23,6 +23,7 @@ from agent.prompt_builder import (
     DEFAULT_AGENT_IDENTITY,
     TOOL_USE_ENFORCEMENT_GUIDANCE,
     TOOL_USE_ENFORCEMENT_MODELS,
+    OPENAI_MODEL_EXECUTION_GUIDANCE,
     MEMORY_GUIDANCE,
     SESSION_SEARCH_GUIDANCE,
     PLATFORM_HINTS,
@@ -1021,6 +1022,41 @@ class TestToolUseEnforcementGuidance:
         assert isinstance(TOOL_USE_ENFORCEMENT_MODELS, tuple)
 
 
+class TestOpenAIModelExecutionGuidance:
+    """Tests for GPT/Codex-specific execution discipline guidance."""
+
+    def test_guidance_covers_tool_persistence(self):
+        text = OPENAI_MODEL_EXECUTION_GUIDANCE.lower()
+        assert "tool_persistence" in text
+        assert "retry" in text
+        assert "empty" in text or "partial" in text
+
+    def test_guidance_covers_prerequisite_checks(self):
+        text = OPENAI_MODEL_EXECUTION_GUIDANCE.lower()
+        assert "prerequisite" in text
+        assert "dependency" in text
+
+    def test_guidance_covers_verification(self):
+        text = OPENAI_MODEL_EXECUTION_GUIDANCE.lower()
+        assert "verification" in text or "verify" in text
+        assert "correctness" in text
+
+    def test_guidance_covers_missing_context(self):
+        text = OPENAI_MODEL_EXECUTION_GUIDANCE.lower()
+        assert "missing_context" in text or "missing context" in text
+        assert "hallucinate" in text or "guess" in text
+
+    def test_guidance_uses_xml_tags(self):
+        assert "<tool_persistence>" in OPENAI_MODEL_EXECUTION_GUIDANCE
+        assert "<prerequisite_checks>" in OPENAI_MODEL_EXECUTION_GUIDANCE
+        assert "<verification>" in OPENAI_MODEL_EXECUTION_GUIDANCE
+        assert "<missing_context>" in OPENAI_MODEL_EXECUTION_GUIDANCE
+
+    def test_guidance_is_string(self):
+        assert isinstance(OPENAI_MODEL_EXECUTION_GUIDANCE, str)
+        assert len(OPENAI_MODEL_EXECUTION_GUIDANCE) > 100
+
+
 # =========================================================================
 # Budget warning history stripping
 # =========================================================================