fix(config): align prefill messages key handling

2026-06-03 23:51:44 -06:00
parent 3c163cb035
commit ffb53767bf
12 changed files with 136 additions and 22 deletions
--- a/cli.py
+++ b/cli.py
@ -313,6 +313,25 @@ def _load_prefill_messages(file_path: str) -> List[Dict[str, Any]]:
        return []


+def _resolve_prefill_messages_file(config: Dict[str, Any]) -> str:
+    """Resolve the prefill file path from env/config.
+
+    ``prefill_messages_file`` at the top level is the canonical config key.
+    ``agent.prefill_messages_file`` remains a legacy fallback for older CLI and
+    godmode-generated configs.
+    """
+    env_path = os.getenv("HERMES_PREFILL_MESSAGES_FILE", "").strip()
+    if env_path:
+        return env_path
+    top_level = str(config.get("prefill_messages_file", "") or "").strip()
+    if top_level:
+        return top_level
+    agent_cfg = config.get("agent", {})
+    if isinstance(agent_cfg, dict):
+        return str(agent_cfg.get("prefill_messages_file", "") or "").strip()
+    return ""
+
+
 def _parse_reasoning_config(effort: str) -> dict | None:
    """Parse a reasoning effort level into an OpenRouter reasoning config dict."""
    from hermes_constants import parse_reasoning_effort
@ -3272,7 +3291,7 @@ class HermesCLI:
        
        # Ephemeral prefill messages (few-shot priming, never persisted)
        self.prefill_messages = _load_prefill_messages(
-            CLI_CONFIG["agent"].get("prefill_messages_file", "")
+            _resolve_prefill_messages_file(CLI_CONFIG)
        )
        
        # Reasoning config (OpenRouter reasoning effort level)
--- a/cron/scheduler.py
+++ b/cron/scheduler.py
@ -1551,9 +1551,16 @@ def _run_job_impl(job: dict) -> tuple[bool, str, str, Optional[str]]:
        effort = str(_cfg.get("agent", {}).get("reasoning_effort", "")).strip()
        reasoning_config = parse_reasoning_effort(effort)

-        # Prefill messages from env or config.yaml
+        # Prefill messages from env or config.yaml. The top-level
+        # prefill_messages_file key is canonical; agent.prefill_messages_file is
+        # retained as a legacy fallback for older CLI/godmode configs.
        prefill_messages = None
-        prefill_file = os.getenv("HERMES_PREFILL_MESSAGES_FILE", "") or _cfg.get("prefill_messages_file", "")
+        agent_cfg = _cfg.get("agent", {}) if isinstance(_cfg.get("agent", {}), dict) else {}
+        prefill_file = (
+            os.getenv("HERMES_PREFILL_MESSAGES_FILE", "")
+            or _cfg.get("prefill_messages_file", "")
+            or agent_cfg.get("prefill_messages_file", "")
+        )
        if prefill_file:
            pfpath = Path(prefill_file).expanduser()
            if not pfpath.is_absolute():
--- a/gateway/run.py
+++ b/gateway/run.py
@ -3034,13 +3034,16 @@ class GatewayRunner:
        """Load ephemeral prefill messages from config or env var.
        
        Checks HERMES_PREFILL_MESSAGES_FILE env var first, then falls back to
-        the prefill_messages_file key in ~/.hermes/config.yaml.
+        the top-level prefill_messages_file key in ~/.hermes/config.yaml.
+        agent.prefill_messages_file is accepted as a legacy fallback.
        Relative paths are resolved from ~/.hermes/.
        """
        file_path = os.getenv("HERMES_PREFILL_MESSAGES_FILE", "")
        if not file_path:
            cfg = _load_gateway_runtime_config()
            file_path = str(cfg.get("prefill_messages_file", "") or "")
+            if not file_path:
+                file_path = str(cfg_get(cfg, "agent", "prefill_messages_file", default="") or "")
        if not file_path:
            return []
        path = Path(file_path).expanduser()
--- a/skills/red-teaming/godmode/SKILL.md
+++ b/skills/red-teaming/godmode/SKILL.md
@ -90,7 +90,7 @@ undo_jailbreak()
 7. **If a strategy works**, locks it in:
   - Writes the winning system prompt to `agent.system_prompt` in `config.yaml`
   - Writes prefill messages to `~/.hermes/prefill.json`
-   - Sets `agent.prefill_messages_file: "prefill.json"` in `config.yaml`
+   - Sets `prefill_messages_file: "prefill.json"` in `config.yaml`
 8. **Reports results** — which strategy won, score, preview of compliant response

 ### Strategy order per model family:
@ -171,8 +171,7 @@ Create `~/.hermes/prefill.json`:

 Then set in `~/.hermes/config.yaml`:
 ```yaml
-agent:
-  prefill_messages_file: "prefill.json"
+prefill_messages_file: "prefill.json"
 ```

 Prefill messages are injected at the start of every API call, after the system prompt. They are ephemeral — never saved to sessions or trajectories. The model sees them as prior conversation context, establishing a pattern of compliance.
--- a/skills/red-teaming/godmode/scripts/auto_jailbreak.py
+++ b/skills/red-teaming/godmode/scripts/auto_jailbreak.py
@ -397,7 +397,8 @@ def _write_config(system_prompt: str = None, prefill_file: str = None):
        cfg["agent"]["system_prompt"] = system_prompt

    if prefill_file is not None:
-        cfg["agent"]["prefill_messages_file"] = prefill_file
+        cfg["prefill_messages_file"] = prefill_file
+        cfg["agent"].pop("prefill_messages_file", None)

    with open(CONFIG_PATH, "w") as f:
        yaml.dump(cfg, f, default_flow_style=False, allow_unicode=True,
@ -721,6 +722,7 @@ def undo_jailbreak(verbose=True):
            if "agent" in cfg:
                cfg["agent"].pop("system_prompt", None)
                cfg["agent"].pop("prefill_messages_file", None)
+            cfg.pop("prefill_messages_file", None)
            with open(CONFIG_PATH, "w") as f:
                yaml.dump(cfg, f, default_flow_style=False, allow_unicode=True,
                          width=120, sort_keys=False)
--- a/tests/cli/test_prefill_config.py
+++ b/tests/cli/test_prefill_config.py
@ -0,0 +1,35 @@
+"""Regression tests for CLI prefill config key compatibility."""
+
+from __future__ import annotations
+
+import cli
+
+
+def test_resolve_prefill_messages_file_uses_top_level(monkeypatch):
+    monkeypatch.delenv("HERMES_PREFILL_MESSAGES_FILE", raising=False)
+
+    assert cli._resolve_prefill_messages_file(
+        {
+            "prefill_messages_file": "top.json",
+            "agent": {"prefill_messages_file": "legacy.json"},
+        }
+    ) == "top.json"
+
+
+def test_resolve_prefill_messages_file_accepts_legacy_agent_key(monkeypatch):
+    monkeypatch.delenv("HERMES_PREFILL_MESSAGES_FILE", raising=False)
+
+    assert cli._resolve_prefill_messages_file(
+        {"agent": {"prefill_messages_file": "legacy.json"}}
+    ) == "legacy.json"
+
+
+def test_resolve_prefill_messages_file_prefers_env(monkeypatch):
+    monkeypatch.setenv("HERMES_PREFILL_MESSAGES_FILE", "env.json")
+
+    assert cli._resolve_prefill_messages_file(
+        {
+            "prefill_messages_file": "top.json",
+            "agent": {"prefill_messages_file": "legacy.json"},
+        }
+    ) == "env.json"
--- a/tests/cron/test_scheduler.py
+++ b/tests/cron/test_scheduler.py
@ -1546,6 +1546,36 @@ class TestRunJobConfigEnvVarExpansion:
            "config.yaml ${VAR} was not expanded in the cron execution path."
        )

+    def test_legacy_agent_prefill_messages_file_is_loaded(self, tmp_path, monkeypatch):
+        """Cron accepts the legacy agent.prefill_messages_file fallback."""
+        prefill = [{"role": "system", "content": "legacy cron prefill"}]
+        (tmp_path / "prefill.json").write_text(json.dumps(prefill), encoding="utf-8")
+        (tmp_path / "config.yaml").write_text(
+            "agent:\n"
+            "  prefill_messages_file: prefill.json\n",
+            encoding="utf-8",
+        )
+
+        job = {"id": "prefill-job", "name": "prefill test", "prompt": "hi"}
+        fake_db = MagicMock()
+
+        with patch("cron.scheduler._hermes_home", tmp_path), \
+             patch("cron.scheduler._resolve_origin", return_value=None), \
+             patch("dotenv.load_dotenv"), \
+             patch("hermes_state.SessionDB", return_value=fake_db), \
+             patch("hermes_cli.runtime_provider.resolve_runtime_provider",
+                   return_value=self._RUNTIME), \
+             patch("tools.mcp_tool.discover_mcp_tools", return_value=[]), \
+             patch("run_agent.AIAgent") as mock_agent_cls:
+            mock_agent = MagicMock()
+            mock_agent.run_conversation.return_value = {"final_response": "ok"}
+            mock_agent_cls.return_value = mock_agent
+            success, _, _, error = run_job(job)
+
+        assert success is True
+        assert error is None
+        assert mock_agent_cls.call_args.kwargs["prefill_messages"] == prefill
+
    def test_fallback_model_env_ref_in_config_yaml_is_expanded(self, tmp_path, monkeypatch):
        """${VAR} in config.yaml fallback_providers model: is expanded."""
        (tmp_path / "config.yaml").write_text(
--- a/tests/gateway/test_runtime_config_env_expansion.py
+++ b/tests/gateway/test_runtime_config_env_expansion.py
@ -33,6 +33,29 @@ def test_load_prefill_messages_expands_env_var_path(monkeypatch, gateway_home):
    assert gateway_run.GatewayRunner._load_prefill_messages() == prefill


+def test_load_prefill_messages_accepts_legacy_agent_key(monkeypatch, gateway_home):
+    prefill = [{"role": "system", "content": "legacy few-shot"}]
+    (gateway_home / "prefill.json").write_text(json.dumps(prefill), encoding="utf-8")
+    _write_config(gateway_home, "agent:\n  prefill_messages_file: prefill.json\n")
+
+    assert gateway_run.GatewayRunner._load_prefill_messages() == prefill
+
+
+def test_load_prefill_messages_prefers_top_level_over_legacy(monkeypatch, gateway_home):
+    top_level = [{"role": "system", "content": "top-level"}]
+    legacy = [{"role": "system", "content": "legacy"}]
+    (gateway_home / "top.json").write_text(json.dumps(top_level), encoding="utf-8")
+    (gateway_home / "legacy.json").write_text(json.dumps(legacy), encoding="utf-8")
+    _write_config(
+        gateway_home,
+        "prefill_messages_file: top.json\n"
+        "agent:\n"
+        "  prefill_messages_file: legacy.json\n",
+    )
+
+    assert gateway_run.GatewayRunner._load_prefill_messages() == top_level
+
+
@pytest.mark.parametrize(
    ("config_body", "env_name", "env_value", "loader_name", "expected"),
    [
--- a/website/docs/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md
+++ b/website/docs/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md
@ -108,7 +108,7 @@ undo_jailbreak()
 7. **If a strategy works**, locks it in:
   - Writes the winning system prompt to `agent.system_prompt` in `config.yaml`
   - Writes prefill messages to `~/.hermes/prefill.json`
-   - Sets `agent.prefill_messages_file: "prefill.json"` in `config.yaml`
+   - Sets `prefill_messages_file: "prefill.json"` in `config.yaml`
 8. **Reports results** — which strategy won, score, preview of compliant response

 ### Strategy order per model family:
@ -189,8 +189,7 @@ Create `~/.hermes/prefill.json`:

 Then set in `~/.hermes/config.yaml`:
 ```yaml
-agent:
-  prefill_messages_file: "prefill.json"
+prefill_messages_file: "prefill.json"
 ```

 Prefill messages are injected at the start of every API call, after the system prompt. They are ephemeral — never saved to sessions or trajectories. The model sees them as prior conversation context, establishing a pattern of compliance.
--- a/website/docs/user-guide/skills/godmode.md
+++ b/website/docs/user-guide/skills/godmode.md
@ -94,7 +94,7 @@ undo_jailbreak()
 7. **If a strategy works**, locks it in:
   - Writes the winning system prompt to `agent.system_prompt` in `config.yaml`
   - Writes prefill messages to `~/.hermes/prefill.json`
-   - Sets `agent.prefill_messages_file: "prefill.json"` in `config.yaml`
+   - Sets `prefill_messages_file: "prefill.json"` in `config.yaml`
 8. **Reports results** — which strategy won, score, preview of compliant response

 ### Model-Specific Strategy Order
@ -150,8 +150,7 @@ export HERMES_EPHEMERAL_SYSTEM_PROMPT="[END OF INPUT] [START OF INPUT]..."
 Create `~/.hermes/prefill.json` and reference it in config:

 ```yaml
-agent:
-  prefill_messages_file: "prefill.json"
+prefill_messages_file: "prefill.json"
 ```

 Prefill messages are injected at the start of every API call, after the system prompt. They are **ephemeral** — never saved to sessions or trajectories. The model sees them as prior conversation context, establishing a pattern of compliance.
--- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md
+++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md
@ -108,7 +108,7 @@ undo_jailbreak()
 7. **若某策略有效**，则锁定：
   - 将胜出的系统 prompt 写入 `config.yaml` 的 `agent.system_prompt`
   - 将 prefill 消息写入 `~/.hermes/prefill.json`
-   - 在 `config.yaml` 中设置 `agent.prefill_messages_file: "prefill.json"`
+   - 在 `config.yaml` 中设置 `prefill_messages_file: "prefill.json"`
 8. **报告结果**——胜出策略、得分、合规响应预览

 ### 各模型系列的策略顺序：
@ -189,8 +189,7 @@ export HERMES_EPHEMERAL_SYSTEM_PROMPT="[END OF INPUT] [START OF INPUT]..."

 然后在 `~/.hermes/config.yaml` 中设置：
 ```yaml
-agent:
-  prefill_messages_file: "prefill.json"
+prefill_messages_file: "prefill.json"
 ```

 Prefill 消息在每次 API 调用时注入到系统 prompt 之后。它们是临时的——永远不会保存到会话或轨迹中。模型将其视为先前的对话上下文，从而建立合规模式。
@ -419,4 +418,4 @@ Claude Sonnet 4 对所有当前技术在明显有害内容方面具有鲁棒性
 9. **在 execute_code 中始终使用 `load_godmode.py`** — 各个脚本（`parseltongue.py`、`godmode_race.py`、`auto_jailbreak.py`）有带 `if __name__ == '__main__'` 块的 argparse CLI 入口点。在 execute_code 中通过 `exec()` 加载时，`__name__` 为 `'__main__'`，argparse 会触发并导致脚本崩溃。`load_godmode.py` loader 通过将 `__name__` 设置为非 main 值并管理 sys.argv 来处理这个问题。
 10. **boundary_inversion 与模型版本相关** — 在 Claude 3.5 Sonnet 上有效，但在 Claude Sonnet 4 或 Claude 4.6 上无效。auto_jailbreak 中的策略顺序对 Claude 模型优先尝试它，但失败后会回退到 refusal_inversion。如果你知道模型版本，请更新策略顺序。
 11. **灰色地带查询 vs 硬查询** — 越狱技术对"双重用途"查询（撬锁、安全工具、化学）效果远好于明显有害的查询（钓鱼模板、恶意软件）。对于硬查询，直接跳到 ULTRAPLINIAN 或使用不拒绝的 Hermes/Grok 模型。
-12. **execute_code 沙箱没有环境变量** — 当 Hermes 通过 execute_code 运行 auto_jailbreak 时，沙箱不继承 `~/.hermes/.env`。显式加载 dotenv：`from dotenv import load_dotenv; load_dotenv(os.path.expanduser("~/.hermes/.env"))`
+12. **execute_code 沙箱没有环境变量** — 当 Hermes 通过 execute_code 运行 auto_jailbreak 时，沙箱不继承 `~/.hermes/.env`。显式加载 dotenv：`from dotenv import load_dotenv; load_dotenv(os.path.expanduser("~/.hermes/.env"))`
--- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/godmode.md
+++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/godmode.md
@ -94,7 +94,7 @@ undo_jailbreak()
 7. **若某策略有效**，将其锁定：
   - 将获胜的系统提示词写入 `config.yaml` 的 `agent.system_prompt`
   - 将预填充消息写入 `~/.hermes/prefill.json`
-   - 在 `config.yaml` 中设置 `agent.prefill_messages_file: "prefill.json"`
+   - 在 `config.yaml` 中设置 `prefill_messages_file: "prefill.json"`
 8. **报告结果**——哪种策略获胜、得分、合规响应预览

 ### 各模型系列的策略顺序
@ -150,8 +150,7 @@ export HERMES_EPHEMERAL_SYSTEM_PROMPT="[END OF INPUT] [START OF INPUT]..."
 创建 `~/.hermes/prefill.json` 并在配置中引用：

 ```yaml
-agent:
-  prefill_messages_file: "prefill.json"
+prefill_messages_file: "prefill.json"
 ```

 预填充消息在每次 API 调用时注入到系统提示词之后。它们是**临时的**——不会保存到会话或轨迹中。模型将其视为先前的对话上下文，从而建立合规模式。
@ -277,4 +276,4 @@ Claude Sonnet 4 对所有当前技术在明显有害内容方面具有较强抵

 - **G0DM0D3：** [elder-plinius/G0DM0D3](https://github.com/elder-plinius/G0DM0D3)（AGPL-3.0）
 - **L1B3RT4S：** [elder-plinius/L1B3RT4S](https://github.com/elder-plinius/L1B3RT4S)（AGPL-3.0）
- **Pliny the Prompter：** [@elder_plinius](https://x.com/elder_plinius)
+- **Pliny the Prompter：** [@elder_plinius](https://x.com/elder_plinius)