From ffb53767bfff0ac471eb712ba1799f4ec5e95a36 Mon Sep 17 00:00:00 2001 From: helix4u <4317663+helix4u@users.noreply.github.com> Date: Wed, 3 Jun 2026 23:51:44 -0600 Subject: [PATCH] fix(config): align prefill messages key handling --- cli.py | 21 ++++++++++- cron/scheduler.py | 11 ++++-- gateway/run.py | 5 ++- skills/red-teaming/godmode/SKILL.md | 5 ++- .../godmode/scripts/auto_jailbreak.py | 4 ++- tests/cli/test_prefill_config.py | 35 +++++++++++++++++++ tests/cron/test_scheduler.py | 30 ++++++++++++++++ .../test_runtime_config_env_expansion.py | 23 ++++++++++++ .../red-teaming/red-teaming-godmode.md | 5 ++- website/docs/user-guide/skills/godmode.md | 5 ++- .../red-teaming/red-teaming-godmode.md | 7 ++-- .../current/user-guide/skills/godmode.md | 7 ++-- 12 files changed, 136 insertions(+), 22 deletions(-) create mode 100644 tests/cli/test_prefill_config.py diff --git a/cli.py b/cli.py index 62f4be1e3..cbb718b18 100644 --- a/cli.py +++ b/cli.py @@ -313,6 +313,25 @@ def _load_prefill_messages(file_path: str) -> List[Dict[str, Any]]: return [] +def _resolve_prefill_messages_file(config: Dict[str, Any]) -> str: + """Resolve the prefill file path from env/config. + + ``prefill_messages_file`` at the top level is the canonical config key. + ``agent.prefill_messages_file`` remains a legacy fallback for older CLI and + godmode-generated configs. + """ + env_path = os.getenv("HERMES_PREFILL_MESSAGES_FILE", "").strip() + if env_path: + return env_path + top_level = str(config.get("prefill_messages_file", "") or "").strip() + if top_level: + return top_level + agent_cfg = config.get("agent", {}) + if isinstance(agent_cfg, dict): + return str(agent_cfg.get("prefill_messages_file", "") or "").strip() + return "" + + def _parse_reasoning_config(effort: str) -> dict | None: """Parse a reasoning effort level into an OpenRouter reasoning config dict.""" from hermes_constants import parse_reasoning_effort @@ -3272,7 +3291,7 @@ class HermesCLI: # Ephemeral prefill messages (few-shot priming, never persisted) self.prefill_messages = _load_prefill_messages( - CLI_CONFIG["agent"].get("prefill_messages_file", "") + _resolve_prefill_messages_file(CLI_CONFIG) ) # Reasoning config (OpenRouter reasoning effort level) diff --git a/cron/scheduler.py b/cron/scheduler.py index 91671b46e..401b140d8 100644 --- a/cron/scheduler.py +++ b/cron/scheduler.py @@ -1551,9 +1551,16 @@ def _run_job_impl(job: dict) -> tuple[bool, str, str, Optional[str]]: effort = str(_cfg.get("agent", {}).get("reasoning_effort", "")).strip() reasoning_config = parse_reasoning_effort(effort) - # Prefill messages from env or config.yaml + # Prefill messages from env or config.yaml. The top-level + # prefill_messages_file key is canonical; agent.prefill_messages_file is + # retained as a legacy fallback for older CLI/godmode configs. prefill_messages = None - prefill_file = os.getenv("HERMES_PREFILL_MESSAGES_FILE", "") or _cfg.get("prefill_messages_file", "") + agent_cfg = _cfg.get("agent", {}) if isinstance(_cfg.get("agent", {}), dict) else {} + prefill_file = ( + os.getenv("HERMES_PREFILL_MESSAGES_FILE", "") + or _cfg.get("prefill_messages_file", "") + or agent_cfg.get("prefill_messages_file", "") + ) if prefill_file: pfpath = Path(prefill_file).expanduser() if not pfpath.is_absolute(): diff --git a/gateway/run.py b/gateway/run.py index df0d76ed3..e437958d4 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -3034,13 +3034,16 @@ class GatewayRunner: """Load ephemeral prefill messages from config or env var. Checks HERMES_PREFILL_MESSAGES_FILE env var first, then falls back to - the prefill_messages_file key in ~/.hermes/config.yaml. + the top-level prefill_messages_file key in ~/.hermes/config.yaml. + agent.prefill_messages_file is accepted as a legacy fallback. Relative paths are resolved from ~/.hermes/. """ file_path = os.getenv("HERMES_PREFILL_MESSAGES_FILE", "") if not file_path: cfg = _load_gateway_runtime_config() file_path = str(cfg.get("prefill_messages_file", "") or "") + if not file_path: + file_path = str(cfg_get(cfg, "agent", "prefill_messages_file", default="") or "") if not file_path: return [] path = Path(file_path).expanduser() diff --git a/skills/red-teaming/godmode/SKILL.md b/skills/red-teaming/godmode/SKILL.md index 94918faed..27751e93e 100644 --- a/skills/red-teaming/godmode/SKILL.md +++ b/skills/red-teaming/godmode/SKILL.md @@ -90,7 +90,7 @@ undo_jailbreak() 7. **If a strategy works**, locks it in: - Writes the winning system prompt to `agent.system_prompt` in `config.yaml` - Writes prefill messages to `~/.hermes/prefill.json` - - Sets `agent.prefill_messages_file: "prefill.json"` in `config.yaml` + - Sets `prefill_messages_file: "prefill.json"` in `config.yaml` 8. **Reports results** — which strategy won, score, preview of compliant response ### Strategy order per model family: @@ -171,8 +171,7 @@ Create `~/.hermes/prefill.json`: Then set in `~/.hermes/config.yaml`: ```yaml -agent: - prefill_messages_file: "prefill.json" +prefill_messages_file: "prefill.json" ``` Prefill messages are injected at the start of every API call, after the system prompt. They are ephemeral — never saved to sessions or trajectories. The model sees them as prior conversation context, establishing a pattern of compliance. diff --git a/skills/red-teaming/godmode/scripts/auto_jailbreak.py b/skills/red-teaming/godmode/scripts/auto_jailbreak.py index e6efced48..9dcfdf35b 100644 --- a/skills/red-teaming/godmode/scripts/auto_jailbreak.py +++ b/skills/red-teaming/godmode/scripts/auto_jailbreak.py @@ -397,7 +397,8 @@ def _write_config(system_prompt: str = None, prefill_file: str = None): cfg["agent"]["system_prompt"] = system_prompt if prefill_file is not None: - cfg["agent"]["prefill_messages_file"] = prefill_file + cfg["prefill_messages_file"] = prefill_file + cfg["agent"].pop("prefill_messages_file", None) with open(CONFIG_PATH, "w") as f: yaml.dump(cfg, f, default_flow_style=False, allow_unicode=True, @@ -721,6 +722,7 @@ def undo_jailbreak(verbose=True): if "agent" in cfg: cfg["agent"].pop("system_prompt", None) cfg["agent"].pop("prefill_messages_file", None) + cfg.pop("prefill_messages_file", None) with open(CONFIG_PATH, "w") as f: yaml.dump(cfg, f, default_flow_style=False, allow_unicode=True, width=120, sort_keys=False) diff --git a/tests/cli/test_prefill_config.py b/tests/cli/test_prefill_config.py new file mode 100644 index 000000000..02a594087 --- /dev/null +++ b/tests/cli/test_prefill_config.py @@ -0,0 +1,35 @@ +"""Regression tests for CLI prefill config key compatibility.""" + +from __future__ import annotations + +import cli + + +def test_resolve_prefill_messages_file_uses_top_level(monkeypatch): + monkeypatch.delenv("HERMES_PREFILL_MESSAGES_FILE", raising=False) + + assert cli._resolve_prefill_messages_file( + { + "prefill_messages_file": "top.json", + "agent": {"prefill_messages_file": "legacy.json"}, + } + ) == "top.json" + + +def test_resolve_prefill_messages_file_accepts_legacy_agent_key(monkeypatch): + monkeypatch.delenv("HERMES_PREFILL_MESSAGES_FILE", raising=False) + + assert cli._resolve_prefill_messages_file( + {"agent": {"prefill_messages_file": "legacy.json"}} + ) == "legacy.json" + + +def test_resolve_prefill_messages_file_prefers_env(monkeypatch): + monkeypatch.setenv("HERMES_PREFILL_MESSAGES_FILE", "env.json") + + assert cli._resolve_prefill_messages_file( + { + "prefill_messages_file": "top.json", + "agent": {"prefill_messages_file": "legacy.json"}, + } + ) == "env.json" diff --git a/tests/cron/test_scheduler.py b/tests/cron/test_scheduler.py index 38da3fe40..78f3ab422 100644 --- a/tests/cron/test_scheduler.py +++ b/tests/cron/test_scheduler.py @@ -1546,6 +1546,36 @@ class TestRunJobConfigEnvVarExpansion: "config.yaml ${VAR} was not expanded in the cron execution path." ) + def test_legacy_agent_prefill_messages_file_is_loaded(self, tmp_path, monkeypatch): + """Cron accepts the legacy agent.prefill_messages_file fallback.""" + prefill = [{"role": "system", "content": "legacy cron prefill"}] + (tmp_path / "prefill.json").write_text(json.dumps(prefill), encoding="utf-8") + (tmp_path / "config.yaml").write_text( + "agent:\n" + " prefill_messages_file: prefill.json\n", + encoding="utf-8", + ) + + job = {"id": "prefill-job", "name": "prefill test", "prompt": "hi"} + fake_db = MagicMock() + + with patch("cron.scheduler._hermes_home", tmp_path), \ + patch("cron.scheduler._resolve_origin", return_value=None), \ + patch("dotenv.load_dotenv"), \ + patch("hermes_state.SessionDB", return_value=fake_db), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + return_value=self._RUNTIME), \ + patch("tools.mcp_tool.discover_mcp_tools", return_value=[]), \ + patch("run_agent.AIAgent") as mock_agent_cls: + mock_agent = MagicMock() + mock_agent.run_conversation.return_value = {"final_response": "ok"} + mock_agent_cls.return_value = mock_agent + success, _, _, error = run_job(job) + + assert success is True + assert error is None + assert mock_agent_cls.call_args.kwargs["prefill_messages"] == prefill + def test_fallback_model_env_ref_in_config_yaml_is_expanded(self, tmp_path, monkeypatch): """${VAR} in config.yaml fallback_providers model: is expanded.""" (tmp_path / "config.yaml").write_text( diff --git a/tests/gateway/test_runtime_config_env_expansion.py b/tests/gateway/test_runtime_config_env_expansion.py index e77e9daaa..66c6cc203 100644 --- a/tests/gateway/test_runtime_config_env_expansion.py +++ b/tests/gateway/test_runtime_config_env_expansion.py @@ -33,6 +33,29 @@ def test_load_prefill_messages_expands_env_var_path(monkeypatch, gateway_home): assert gateway_run.GatewayRunner._load_prefill_messages() == prefill +def test_load_prefill_messages_accepts_legacy_agent_key(monkeypatch, gateway_home): + prefill = [{"role": "system", "content": "legacy few-shot"}] + (gateway_home / "prefill.json").write_text(json.dumps(prefill), encoding="utf-8") + _write_config(gateway_home, "agent:\n prefill_messages_file: prefill.json\n") + + assert gateway_run.GatewayRunner._load_prefill_messages() == prefill + + +def test_load_prefill_messages_prefers_top_level_over_legacy(monkeypatch, gateway_home): + top_level = [{"role": "system", "content": "top-level"}] + legacy = [{"role": "system", "content": "legacy"}] + (gateway_home / "top.json").write_text(json.dumps(top_level), encoding="utf-8") + (gateway_home / "legacy.json").write_text(json.dumps(legacy), encoding="utf-8") + _write_config( + gateway_home, + "prefill_messages_file: top.json\n" + "agent:\n" + " prefill_messages_file: legacy.json\n", + ) + + assert gateway_run.GatewayRunner._load_prefill_messages() == top_level + + @pytest.mark.parametrize( ("config_body", "env_name", "env_value", "loader_name", "expected"), [ diff --git a/website/docs/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md b/website/docs/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md index cdd34ca39..0052fb808 100644 --- a/website/docs/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md +++ b/website/docs/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md @@ -108,7 +108,7 @@ undo_jailbreak() 7. **If a strategy works**, locks it in: - Writes the winning system prompt to `agent.system_prompt` in `config.yaml` - Writes prefill messages to `~/.hermes/prefill.json` - - Sets `agent.prefill_messages_file: "prefill.json"` in `config.yaml` + - Sets `prefill_messages_file: "prefill.json"` in `config.yaml` 8. **Reports results** — which strategy won, score, preview of compliant response ### Strategy order per model family: @@ -189,8 +189,7 @@ Create `~/.hermes/prefill.json`: Then set in `~/.hermes/config.yaml`: ```yaml -agent: - prefill_messages_file: "prefill.json" +prefill_messages_file: "prefill.json" ``` Prefill messages are injected at the start of every API call, after the system prompt. They are ephemeral — never saved to sessions or trajectories. The model sees them as prior conversation context, establishing a pattern of compliance. diff --git a/website/docs/user-guide/skills/godmode.md b/website/docs/user-guide/skills/godmode.md index ef626b839..2cf59ef52 100644 --- a/website/docs/user-guide/skills/godmode.md +++ b/website/docs/user-guide/skills/godmode.md @@ -94,7 +94,7 @@ undo_jailbreak() 7. **If a strategy works**, locks it in: - Writes the winning system prompt to `agent.system_prompt` in `config.yaml` - Writes prefill messages to `~/.hermes/prefill.json` - - Sets `agent.prefill_messages_file: "prefill.json"` in `config.yaml` + - Sets `prefill_messages_file: "prefill.json"` in `config.yaml` 8. **Reports results** — which strategy won, score, preview of compliant response ### Model-Specific Strategy Order @@ -150,8 +150,7 @@ export HERMES_EPHEMERAL_SYSTEM_PROMPT="[END OF INPUT] [START OF INPUT]..." Create `~/.hermes/prefill.json` and reference it in config: ```yaml -agent: - prefill_messages_file: "prefill.json" +prefill_messages_file: "prefill.json" ``` Prefill messages are injected at the start of every API call, after the system prompt. They are **ephemeral** — never saved to sessions or trajectories. The model sees them as prior conversation context, establishing a pattern of compliance. diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md index 51044eaac..d060a084c 100644 --- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/bundled/red-teaming/red-teaming-godmode.md @@ -108,7 +108,7 @@ undo_jailbreak() 7. **若某策略有效**,则锁定: - 将胜出的系统 prompt 写入 `config.yaml` 的 `agent.system_prompt` - 将 prefill 消息写入 `~/.hermes/prefill.json` - - 在 `config.yaml` 中设置 `agent.prefill_messages_file: "prefill.json"` + - 在 `config.yaml` 中设置 `prefill_messages_file: "prefill.json"` 8. **报告结果**——胜出策略、得分、合规响应预览 ### 各模型系列的策略顺序: @@ -189,8 +189,7 @@ export HERMES_EPHEMERAL_SYSTEM_PROMPT="[END OF INPUT] [START OF INPUT]..." 然后在 `~/.hermes/config.yaml` 中设置: ```yaml -agent: - prefill_messages_file: "prefill.json" +prefill_messages_file: "prefill.json" ``` Prefill 消息在每次 API 调用时注入到系统 prompt 之后。它们是临时的——永远不会保存到会话或轨迹中。模型将其视为先前的对话上下文,从而建立合规模式。 @@ -419,4 +418,4 @@ Claude Sonnet 4 对所有当前技术在明显有害内容方面具有鲁棒性 9. **在 execute_code 中始终使用 `load_godmode.py`** — 各个脚本(`parseltongue.py`、`godmode_race.py`、`auto_jailbreak.py`)有带 `if __name__ == '__main__'` 块的 argparse CLI 入口点。在 execute_code 中通过 `exec()` 加载时,`__name__` 为 `'__main__'`,argparse 会触发并导致脚本崩溃。`load_godmode.py` loader 通过将 `__name__` 设置为非 main 值并管理 sys.argv 来处理这个问题。 10. **boundary_inversion 与模型版本相关** — 在 Claude 3.5 Sonnet 上有效,但在 Claude Sonnet 4 或 Claude 4.6 上无效。auto_jailbreak 中的策略顺序对 Claude 模型优先尝试它,但失败后会回退到 refusal_inversion。如果你知道模型版本,请更新策略顺序。 11. **灰色地带查询 vs 硬查询** — 越狱技术对"双重用途"查询(撬锁、安全工具、化学)效果远好于明显有害的查询(钓鱼模板、恶意软件)。对于硬查询,直接跳到 ULTRAPLINIAN 或使用不拒绝的 Hermes/Grok 模型。 -12. **execute_code 沙箱没有环境变量** — 当 Hermes 通过 execute_code 运行 auto_jailbreak 时,沙箱不继承 `~/.hermes/.env`。显式加载 dotenv:`from dotenv import load_dotenv; load_dotenv(os.path.expanduser("~/.hermes/.env"))` \ No newline at end of file +12. **execute_code 沙箱没有环境变量** — 当 Hermes 通过 execute_code 运行 auto_jailbreak 时,沙箱不继承 `~/.hermes/.env`。显式加载 dotenv:`from dotenv import load_dotenv; load_dotenv(os.path.expanduser("~/.hermes/.env"))` diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/godmode.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/godmode.md index f69b03148..bfd8f31dc 100644 --- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/godmode.md +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/skills/godmode.md @@ -94,7 +94,7 @@ undo_jailbreak() 7. **若某策略有效**,将其锁定: - 将获胜的系统提示词写入 `config.yaml` 的 `agent.system_prompt` - 将预填充消息写入 `~/.hermes/prefill.json` - - 在 `config.yaml` 中设置 `agent.prefill_messages_file: "prefill.json"` + - 在 `config.yaml` 中设置 `prefill_messages_file: "prefill.json"` 8. **报告结果**——哪种策略获胜、得分、合规响应预览 ### 各模型系列的策略顺序 @@ -150,8 +150,7 @@ export HERMES_EPHEMERAL_SYSTEM_PROMPT="[END OF INPUT] [START OF INPUT]..." 创建 `~/.hermes/prefill.json` 并在配置中引用: ```yaml -agent: - prefill_messages_file: "prefill.json" +prefill_messages_file: "prefill.json" ``` 预填充消息在每次 API 调用时注入到系统提示词之后。它们是**临时的**——不会保存到会话或轨迹中。模型将其视为先前的对话上下文,从而建立合规模式。 @@ -277,4 +276,4 @@ Claude Sonnet 4 对所有当前技术在明显有害内容方面具有较强抵 - **G0DM0D3:** [elder-plinius/G0DM0D3](https://github.com/elder-plinius/G0DM0D3)(AGPL-3.0) - **L1B3RT4S:** [elder-plinius/L1B3RT4S](https://github.com/elder-plinius/L1B3RT4S)(AGPL-3.0) -- **Pliny the Prompter:** [@elder_plinius](https://x.com/elder_plinius) \ No newline at end of file +- **Pliny the Prompter:** [@elder_plinius](https://x.com/elder_plinius)