From f24b7ed9d95301c7f610241588dcf0589b900cd2 Mon Sep 17 00:00:00 2001 From: Stephen Schoettler Date: Tue, 12 May 2026 15:03:27 -0700 Subject: [PATCH] fix: make Honcho startup fail open --- plugins/memory/honcho/__init__.py | 194 ++++++++++---- tests/test_honcho_startup_fail_open.py | 357 +++++++++++++++++++++++++ 2 files changed, 499 insertions(+), 52 deletions(-) create mode 100644 tests/test_honcho_startup_fail_open.py diff --git a/plugins/memory/honcho/__init__.py b/plugins/memory/honcho/__init__.py index 6e6f39b8c..3d1302933 100644 --- a/plugins/memory/honcho/__init__.py +++ b/plugins/memory/honcho/__init__.py @@ -228,6 +228,9 @@ class HonchoMemoryProvider(MemoryProvider): self._session_initialized = False self._lazy_init_kwargs: Optional[dict] = None self._lazy_init_session_id: Optional[str] = None + self._init_thread: Optional[threading.Thread] = None + self._init_lock = threading.Lock() + self._init_error = "" # Port #4053: cron guard — when True, plugin is fully inactive self._cron_skipped = False @@ -326,22 +329,24 @@ class HonchoMemoryProvider(MemoryProvider): # aiPeer comes from honcho.json (host block or root) only. # SOUL.md is persona content, not identity config. - # ----- Port #1957: lazy session init for tools-only mode ----- + self._lazy_init_kwargs = dict(kwargs) + self._lazy_init_session_id = session_id + self._session_key = self._resolve_session_key(cfg, session_id, **kwargs) + + # Network-backed session creation can block on Honcho service or DB + # outages. Startup must fail open for context/hybrid modes, where + # Honcho is initialized only to enrich prompts. Tools-only mode has + # an explicit contract: init_on_session_start=False stays lazy until + # the first tool call, while init_on_session_start=True remains an + # eager, ready-on-return initialization path. if self._recall_mode == "tools": if cfg.init_on_session_start: - # Eager init even in tools mode (opt-in) - self._do_session_init(cfg, session_id, **kwargs) + self._ensure_session() return - # Defer actual session creation until first tool call - self._lazy_init_kwargs = kwargs - self._lazy_init_session_id = session_id - # Still need a client reference for _ensure_session - self._config = cfg logger.debug("Honcho tools-only mode — deferring session init until first tool call") return - # ----- Eager init (context or hybrid mode) ----- - self._do_session_init(cfg, session_id, **kwargs) + self._start_session_init_background(wait_timeout=0.1) except ImportError: logger.debug("honcho-ai package not installed — plugin inactive") @@ -349,6 +354,66 @@ class HonchoMemoryProvider(MemoryProvider): logger.warning("Honcho init failed: %s", e) self._manager = None + def _resolve_session_key(self, cfg, session_id: str, **kwargs) -> str: + """Resolve the Honcho session key without touching the network.""" + session_title = kwargs.get("session_title") + gateway_session_key = kwargs.get("gateway_session_key") + return ( + cfg.resolve_session_name( + session_title=session_title, + session_id=session_id, + gateway_session_key=gateway_session_key, + ) + or session_id + or "hermes-default" + ) + + def _start_session_init_background(self, *, wait_timeout: float = 0.0) -> None: + """Start Honcho session initialization in a daemon thread. + + This keeps Hermes CLI/gateway startup responsive when Honcho is down, + slow, or its database is unhealthy. The thread may still take the SDK + timeout path, but it cannot block agent construction or first prompt + assembly. ``wait_timeout`` lets fast/mock initializations finish before + returning while still failing open for slow backends. + """ + if self._cron_skipped or self._session_initialized: + return + if not self._config or self._lazy_init_kwargs is None: + return + + with self._init_lock: + if self._cron_skipped or self._session_initialized: + return + if self._init_thread and self._init_thread.is_alive(): + return + if not self._config or self._lazy_init_kwargs is None: + return + + cfg = self._config + init_kwargs = dict(self._lazy_init_kwargs) + init_session_id = self._lazy_init_session_id or "hermes-default" + + def _run() -> None: + try: + self._do_session_init(cfg, init_session_id, **init_kwargs) + self._lazy_init_kwargs = None + self._lazy_init_session_id = None + self._init_error = "" + except Exception as e: + self._init_error = str(e) + self._manager = None + logger.warning("Honcho background session init failed: %s", e) + + self._init_thread = threading.Thread( + target=_run, + daemon=True, + name="honcho-session-init", + ) + self._init_thread.start() + if wait_timeout > 0: + self._init_thread.join(timeout=wait_timeout) + def _do_session_init(self, cfg, session_id: str, **kwargs) -> None: """Shared session initialization logic for both eager and lazy paths.""" from plugins.memory.honcho.client import get_honcho_client @@ -364,22 +429,15 @@ class HonchoMemoryProvider(MemoryProvider): ) # ----- B3: resolve_session_name ----- - session_title = kwargs.get("session_title") - gateway_session_key = kwargs.get("gateway_session_key") - self._session_key = ( - cfg.resolve_session_name( - session_title=session_title, - session_id=session_id, - gateway_session_key=gateway_session_key, - ) - or session_id - or "hermes-default" - ) + self._session_key = self._resolve_session_key(cfg, session_id, **kwargs) logger.debug("Honcho session key resolved: %s", self._session_key) - # Create session eagerly + # Create the remote session before running startup-only migration and + # prewarm work. Do not mark the provider ready until this method's + # synchronous setup has finished; background startup sets _manager before + # get_or_create()/migration/prewarm are complete, and lifecycle hooks must + # not treat that partially initialized state as usable. session = self._manager.get_or_create(self._session_key) - self._session_initialized = True # ----- B6: Memory file migration (one-time, for new sessions) ----- # Skip under per-session strategy: every Hermes run creates a fresh @@ -434,12 +492,15 @@ class HonchoMemoryProvider(MemoryProvider): self._dialectic_empty_streak += 1 self._prefetch_thread_started_at = time.monotonic() - self._prefetch_thread = threading.Thread( + prewarm_thread = threading.Thread( target=_prewarm_dialectic, daemon=True, name="honcho-prewarm-dialectic" ) - self._prefetch_thread.start() + prewarm_thread.start() + self._prefetch_thread = prewarm_thread logger.debug("Honcho pre-warm started for session: %s", self._session_key) + self._session_initialized = True + def _ensure_session(self) -> bool: """Lazily initialize the Honcho session (for tools-only mode). @@ -449,7 +510,9 @@ class HonchoMemoryProvider(MemoryProvider): return True if self._cron_skipped: return False - if not self._config or not self._lazy_init_kwargs: + if self._init_thread and self._init_thread.is_alive(): + return False + if not self._config or self._lazy_init_kwargs is None: return False try: @@ -463,9 +526,26 @@ class HonchoMemoryProvider(MemoryProvider): self._lazy_init_session_id = None return self._manager is not None except Exception as e: + self._manager = None + self._session_initialized = False logger.warning("Honcho lazy session init failed: %s", e) return False + def _session_ready(self) -> bool: + """Return whether a manager/session key can be used safely. + + Background initialization sets ``_manager`` before the blocking + get-or-create call completes, so ``_session_initialized`` guards real + async startup. Tests and legacy direct construction may inject a ready + manager/session key without setting that flag; allow that only when no + init thread is currently in flight. + """ + if not self._manager or not self._session_key: + return False + if self._session_initialized: + return True + return not (self._init_thread and self._init_thread.is_alive()) + def _format_first_turn_context(self, ctx: dict) -> str: """Format the prefetch context dict into a readable system prompt block.""" parts = [] @@ -505,14 +585,8 @@ class HonchoMemoryProvider(MemoryProvider): if self._cron_skipped: return "" if not self._manager or not self._session_key: - # tools-only mode without session yet still returns a minimal block - if self._recall_mode == "tools" and self._config: - return ( - "# Honcho Memory\n" - "Active (tools-only mode). Use honcho_profile, honcho_search, " - "honcho_reasoning, honcho_context, and honcho_conclude tools to access user memory." - ) - return "" + if not self._config: + return "" # ----- B1: adapt text based on recall_mode ----- if self._recall_mode == "context": @@ -563,6 +637,10 @@ class HonchoMemoryProvider(MemoryProvider): if self._recall_mode == "tools": return "" + if not self._session_ready(): + self._start_session_init_background() + return "" + # B5: injection_frequency — if "first-turn" and past first turn, return empty. # _turn_count is 1-indexed (first user message = 1), so > 1 means "past first". if self._injection_frequency == "first-turn" and self._turn_count > 1: @@ -575,18 +653,17 @@ class HonchoMemoryProvider(MemoryProvider): parts = [] # ----- Layer 1: Base context (representation + card) ----- - # On first call, fetch synchronously so turn 1 isn't empty. - # After that, serve from cache and refresh in background on cadence. + # First fetch is asynchronous: a slow Honcho backend must not block the + # first response. Serve empty context now and consume the background + # result on a later turn. with self._base_context_lock: if self._base_context_cache is None: - # First call — synchronous fetch + self._base_context_cache = "" + self._last_context_turn = self._turn_count try: - ctx = self._manager.get_prefetch_context(self._session_key) - self._base_context_cache = self._format_first_turn_context(ctx) if ctx else "" - self._last_context_turn = self._turn_count + self._manager.prefetch_context(self._session_key, query or None) except Exception as e: - logger.debug("Honcho base context fetch failed: %s", e) - self._base_context_cache = "" + logger.debug("Honcho base context prefetch failed: %s", e) base_context = self._base_context_cache # Check if background context prefetch has a fresher result @@ -641,10 +718,11 @@ class HonchoMemoryProvider(MemoryProvider): self._dialectic_empty_streak += 1 self._prefetch_thread_started_at = time.monotonic() - self._prefetch_thread = threading.Thread( + first_turn_thread = threading.Thread( target=_run_first_turn, daemon=True, name="honcho-prefetch-first" ) - self._prefetch_thread.start() + first_turn_thread.start() + self._prefetch_thread = first_turn_thread self._prefetch_thread.join(timeout=_first_turn_timeout) if self._prefetch_thread.is_alive(): logger.debug( @@ -709,13 +787,14 @@ class HonchoMemoryProvider(MemoryProvider): """ if self._cron_skipped: return - if not self._manager or not self._session_key or not query: - return - # B1: tools-only mode — no prefetch if self._recall_mode == "tools": return + if not self._session_ready() or not query: + self._start_session_init_background() + return + # Trivial prompts don't warrant either a context refresh or a dialectic call. if self._is_trivial_prompt(query): return @@ -769,10 +848,11 @@ class HonchoMemoryProvider(MemoryProvider): self._dialectic_empty_streak += 1 self._prefetch_thread_started_at = time.monotonic() - self._prefetch_thread = threading.Thread( + prefetch_thread = threading.Thread( target=_run, daemon=True, name="honcho-prefetch" ) - self._prefetch_thread.start() + prefetch_thread.start() + self._prefetch_thread = prefetch_thread # ----- Dialectic depth: multi-pass .chat() with cold/warm prompts ----- @@ -1126,7 +1206,10 @@ class HonchoMemoryProvider(MemoryProvider): """ if self._cron_skipped: return - if not self._manager or not self._session_key: + if self._recall_mode == "tools" and not self._session_ready(): + return + if not self._session_ready(): + self._start_session_init_background() return msg_limit = self._config.message_max_chars if self._config else 25000 @@ -1169,7 +1252,10 @@ class HonchoMemoryProvider(MemoryProvider): return if self._cron_skipped: return - if not self._manager or not self._session_key: + if self._recall_mode == "tools" and not self._session_ready(): + return + if not self._session_ready(): + self._start_session_init_background() return def _write(): @@ -1187,6 +1273,8 @@ class HonchoMemoryProvider(MemoryProvider): return if not self._manager: return + if not self._session_initialized and self._init_thread and self._init_thread.is_alive(): + return # Wait for pending sync if self._sync_thread and self._sync_thread.is_alive(): self._sync_thread.join(timeout=10.0) @@ -1213,6 +1301,8 @@ class HonchoMemoryProvider(MemoryProvider): # Port #1957: ensure session is initialized for tools-only mode if not self._session_initialized: + if self._init_thread and self._init_thread.is_alive(): + return tool_error("Honcho session is still initializing; try again shortly.") if not self._ensure_session(): return tool_error("Honcho session could not be initialized.") @@ -1313,7 +1403,7 @@ class HonchoMemoryProvider(MemoryProvider): if t and t.is_alive(): t.join(timeout=5.0) # Flush any remaining messages - if self._manager: + if self._manager and not (self._init_thread and self._init_thread.is_alive() and not self._session_initialized): try: self._manager.flush_all() except Exception: diff --git a/tests/test_honcho_startup_fail_open.py b/tests/test_honcho_startup_fail_open.py new file mode 100644 index 000000000..7993db5aa --- /dev/null +++ b/tests/test_honcho_startup_fail_open.py @@ -0,0 +1,357 @@ +"""Regression tests for Honcho startup fail-open behavior.""" + +from __future__ import annotations + +import json +import threading +import time +from types import SimpleNamespace + +from plugins.memory.honcho import HonchoMemoryProvider + + +class _FakeHonchoConfig(SimpleNamespace): + def resolve_session_name(self, **kwargs): + return "test-session" + + +def _configured_hybrid_config() -> _FakeHonchoConfig: + return _FakeHonchoConfig( + enabled=True, + api_key=None, + base_url="http://127.0.0.1:8000", + recall_mode="hybrid", + init_on_session_start=False, + dialectic_depth=1, + dialectic_depth_levels=None, + reasoning_heuristic=True, + reasoning_level_cap="high", + context_tokens=None, + message_max_chars=25000, + session_strategy="per-directory", + ) + + +def _configured_tools_config(*, init_on_session_start: bool = False) -> _FakeHonchoConfig: + cfg = _configured_hybrid_config() + cfg.recall_mode = "tools" + cfg.init_on_session_start = init_on_session_start + return cfg + + +def test_honcho_hybrid_initialize_returns_without_waiting_for_session_init(monkeypatch): + """Slow Honcho session creation must not block agent startup.""" + provider = HonchoMemoryProvider() + cfg = _configured_hybrid_config() + started = threading.Event() + release = threading.Event() + + monkeypatch.setattr( + "plugins.memory.honcho.client.HonchoClientConfig.from_global_config", + lambda: cfg, + ) + + def slow_session_init(self, cfg, session_id, **kwargs): + started.set() + release.wait(timeout=5) + self._session_initialized = True + + monkeypatch.setattr(HonchoMemoryProvider, "_do_session_init", slow_session_init) + + start = time.perf_counter() + provider.initialize("session-1", platform="cli") + elapsed = time.perf_counter() - start + + try: + assert elapsed < 0.5 + assert started.wait(timeout=1) + assert provider._session_key == "test-session" + finally: + release.set() + init_thread = getattr(provider, "_init_thread", None) + if init_thread: + init_thread.join(timeout=1) + + +def test_honcho_background_init_rechecks_state_after_lock_race(): + """Startup should not spawn/crash if init completes while waiting for lock.""" + provider = HonchoMemoryProvider() + provider._config = _configured_hybrid_config() + provider._lazy_init_kwargs = {"platform": "cli"} + provider._lazy_init_session_id = "session-1" + + class RacingLock: + def __enter__(self): + provider._session_initialized = True + provider._lazy_init_kwargs = None + return self + + def __exit__(self, exc_type, exc, tb): + return False + + provider._init_lock = RacingLock() + + provider._start_session_init_background() + + assert provider._init_thread is None + assert provider._session_initialized is True + + +def test_honcho_prefetch_returns_without_waiting_for_first_context_fetch(): + """First-turn context injection must fail open when Honcho is slow.""" + provider = HonchoMemoryProvider() + cfg = _configured_hybrid_config() + cfg.timeout = 0.1 + fetch_started = threading.Event() + + class SlowManager: + def get_prefetch_context(self, session_key, user_message=None): + fetch_started.set() + time.sleep(5) + return {"representation": "late"} + + def prefetch_context(self, session_key, user_message=None): + fetch_started.set() + + def pop_context_result(self, session_key): + return {} + + provider._config = cfg + provider._manager = SlowManager() + provider._session_key = "test-session" + provider._session_initialized = True + provider._turn_count = 1 + + start = time.perf_counter() + result = provider.prefetch("what do you know about me?") + elapsed = time.perf_counter() - start + + assert result == "" + assert elapsed < 0.5 + assert fetch_started.is_set() + + + +def test_honcho_sync_turn_does_not_start_network_write_before_session_init(): + """Session-end sync must not create a blocking writer before init finishes.""" + provider = HonchoMemoryProvider() + cfg = _configured_hybrid_config() + get_started = threading.Event() + background_started = threading.Event() + release_init = threading.Event() + + class SlowManager: + def get_or_create(self, session_key): + get_started.set() + time.sleep(5) + return SimpleNamespace() + + def _flush_session(self, session): + pass + + provider._config = cfg + provider._manager = SlowManager() + provider._session_key = "test-session" + provider._session_initialized = False + provider._start_session_init_background = background_started.set + provider._init_thread = threading.Thread( + target=lambda: release_init.wait(timeout=5), daemon=True + ) + provider._init_thread.start() + + try: + provider.sync_turn("hello", "world") + + assert provider._sync_thread is None + assert background_started.is_set() + assert not get_started.wait(timeout=0.1) + finally: + release_init.set() + provider._init_thread.join(timeout=1) + + +def test_honcho_sync_turn_waits_for_full_background_startup(monkeypatch): + """Manager assignment alone is not readiness while background init continues.""" + provider = HonchoMemoryProvider() + cfg = _configured_hybrid_config() + session_created = threading.Event() + migration_started = threading.Event() + release_migration = threading.Event() + get_calls = [] + + class StartupManager: + def __init__(self, *args, **kwargs): + pass + + def get_or_create(self, session_key): + get_calls.append(session_key) + session_created.set() + return SimpleNamespace(messages=[]) + + def migrate_memory_files(self, session_key, mem_dir): + migration_started.set() + release_migration.wait(timeout=5) + + def prefetch_context(self, session_key, user_message=None): + pass + + def _flush_session(self, session): + pass + + monkeypatch.setattr( + "plugins.memory.honcho.client.HonchoClientConfig.from_global_config", + lambda: cfg, + ) + monkeypatch.setattr("plugins.memory.honcho.client.get_honcho_client", lambda cfg: object()) + monkeypatch.setattr("plugins.memory.honcho.session.HonchoSessionManager", StartupManager) + + provider.initialize("session-1", platform="cli") + try: + assert session_created.wait(timeout=1) + assert migration_started.wait(timeout=1) + assert provider._manager is not None + assert provider._session_initialized is False + + provider.sync_turn("hello", "world") + + assert provider._sync_thread is None + assert get_calls == ["test-session"] + finally: + release_migration.set() + init_thread = getattr(provider, "_init_thread", None) + if init_thread: + init_thread.join(timeout=1) + if provider._prefetch_thread: + provider._prefetch_thread.join(timeout=1) + + assert provider._session_initialized is True + + +def test_honcho_system_prompt_advertises_active_while_background_init_runs(monkeypatch): + """Prompt metadata should not require a completed network session.""" + provider = HonchoMemoryProvider() + cfg = _configured_hybrid_config() + release = threading.Event() + + monkeypatch.setattr( + "plugins.memory.honcho.client.HonchoClientConfig.from_global_config", + lambda: cfg, + ) + + def slow_session_init(self, cfg, session_id, **kwargs): + release.wait(timeout=5) + self._session_initialized = True + + monkeypatch.setattr(HonchoMemoryProvider, "_do_session_init", slow_session_init) + + provider.initialize("session-1", platform="cli") + try: + prompt = provider.system_prompt_block() + assert "Honcho Memory" in prompt + assert "hybrid mode" in prompt + finally: + release.set() + init_thread = getattr(provider, "_init_thread", None) + if init_thread: + init_thread.join(timeout=1) + + +def test_honcho_tools_eager_init_still_ready_on_return(monkeypatch): + """tools + initOnSessionStart=true keeps its ready-on-return contract.""" + provider = HonchoMemoryProvider() + cfg = _configured_tools_config(init_on_session_start=True) + + monkeypatch.setattr( + "plugins.memory.honcho.client.HonchoClientConfig.from_global_config", + lambda: cfg, + ) + + def fake_session_init(self, cfg, session_id, **kwargs): + self._manager = SimpleNamespace() + self._session_key = "test-session" + self._session_initialized = True + + monkeypatch.setattr(HonchoMemoryProvider, "_do_session_init", fake_session_init) + + provider.initialize("session-1", platform="cli") + + assert provider._session_initialized is True + assert provider._manager is not None + assert provider._init_thread is None + + +def test_honcho_tools_eager_init_failure_does_not_leave_ready_manager(monkeypatch): + """Failed eager tools startup must not leave hooks seeing a ready session.""" + provider = HonchoMemoryProvider() + cfg = _configured_tools_config(init_on_session_start=True) + + monkeypatch.setattr( + "plugins.memory.honcho.client.HonchoClientConfig.from_global_config", + lambda: cfg, + ) + + def failing_session_init(self, cfg, session_id, **kwargs): + self._manager = SimpleNamespace() + self._session_key = "test-session" + raise RuntimeError("boom") + + monkeypatch.setattr(HonchoMemoryProvider, "_do_session_init", failing_session_init) + + provider.initialize("session-1", platform="cli") + assert provider._session_initialized is False + assert provider._manager is None + + background_started = threading.Event() + provider._start_session_init_background = background_started.set + provider.sync_turn("hello", "world") + provider.on_memory_write("add", "user", "prefers safe Honcho startup") + + assert provider._sync_thread is None + assert not background_started.is_set() + + result = json.loads(provider.handle_tool_call("honcho_profile", {"peer": "user"})) + assert "could not be initialized" in result["error"] + assert provider._manager is None + + +def test_honcho_tools_lazy_hooks_do_not_prestart_background_init(monkeypatch): + """tools lazy mode lets the first tool call own session initialization.""" + provider = HonchoMemoryProvider() + cfg = _configured_tools_config(init_on_session_start=False) + + monkeypatch.setattr( + "plugins.memory.honcho.client.HonchoClientConfig.from_global_config", + lambda: cfg, + ) + + provider.initialize("session-1", platform="cli") + background_started = threading.Event() + provider._start_session_init_background = background_started.set + + provider.prefetch("what do you know?") + provider.queue_prefetch("what do you know?") + provider.sync_turn("hello", "world") + provider.on_memory_write("add", "user", "prefers fail-open memory") + + assert not background_started.is_set() + assert provider._session_initialized is False + + class ToolManager: + def get_peer_card(self, session_key, peer="user"): + return ["ready"] + + init_calls = [] + + def fake_session_init(self, cfg, session_id, **kwargs): + init_calls.append(session_id) + self._manager = ToolManager() + self._session_key = "test-session" + self._session_initialized = True + + monkeypatch.setattr(HonchoMemoryProvider, "_do_session_init", fake_session_init) + + result = json.loads(provider.handle_tool_call("honcho_profile", {"peer": "user"})) + + assert result == {"result": ["ready"]} + assert init_calls == ["session-1"] + assert not background_started.is_set()