diff --git a/hermes_cli/banner.py b/hermes_cli/banner.py index c91b2f728..f25d03d2a 100644 --- a/hermes_cli/banner.py +++ b/hermes_cli/banner.py @@ -12,14 +12,16 @@ import threading import time from pathlib import Path from hermes_constants import get_hermes_home -from typing import Dict, List, Optional +from typing import TYPE_CHECKING, Dict, List, Optional -from rich.console import Console -from rich.panel import Panel -from rich.table import Table - -from prompt_toolkit import print_formatted_text as _pt_print -from prompt_toolkit.formatted_text import ANSI as _PT_ANSI +# rich and prompt_toolkit are imported lazily (inside the functions that use +# them) rather than at module level. Importing this module is on the TUI +# gateway's critical startup path purely to reach the lightweight update-check +# helpers (``prefetch_update_check``); pulling rich.console + prompt_toolkit +# eagerly added ~50ms of wasted imports before ``gateway.ready`` could fire. +# Keep the type-only reference available to checkers without the runtime cost. +if TYPE_CHECKING: + from rich.console import Console logger = logging.getLogger(__name__) @@ -36,6 +38,8 @@ _RST = "\033[0m" def cprint(text: str): """Print ANSI-colored text through prompt_toolkit's renderer.""" + from prompt_toolkit import print_formatted_text as _pt_print + from prompt_toolkit.formatted_text import ANSI as _PT_ANSI _pt_print(_PT_ANSI(text)) @@ -471,7 +475,7 @@ def _display_toolset_name(toolset_name: str) -> str: ) -def build_welcome_banner(console: Console, model: str, cwd: str, +def build_welcome_banner(console: "Console", model: str, cwd: str, tools: List[dict] = None, enabled_toolsets: List[str] = None, session_id: str = None, @@ -490,6 +494,8 @@ def build_welcome_banner(console: Console, model: str, cwd: str, context_length: Model's context window size in tokens. """ from model_tools import check_tool_availability, TOOLSET_REQUIREMENTS + from rich.panel import Panel + from rich.table import Table if get_toolset_for_tool is None: from model_tools import get_toolset_for_tool diff --git a/tests/tui_gateway/test_wait_for_mcp_discovery.py b/tests/tui_gateway/test_wait_for_mcp_discovery.py new file mode 100644 index 000000000..ab5bb5f6d --- /dev/null +++ b/tests/tui_gateway/test_wait_for_mcp_discovery.py @@ -0,0 +1,78 @@ +"""Tests for tui_gateway.entry.wait_for_mcp_discovery (PR #35245). + +MCP tool discovery runs in a background daemon thread so a slow/dead server +can't freeze ``gateway.ready``. The agent snapshots its tool list once at +build time and never re-reads it, so ``_make_agent`` briefly joins the +discovery thread before building — bounded, so a dead server can't re-introduce +the startup hang, and a no-op once discovery has finished. +""" + +import threading +import time + +import tui_gateway.entry as entry + + +def _restore_thread_slot(saved): + entry._mcp_discovery_thread = saved + + +def test_no_thread_is_noop(): + """When no discovery thread was started (the common no-MCP case), the + helper returns immediately and never blocks.""" + saved = entry._mcp_discovery_thread + try: + entry._mcp_discovery_thread = None + start = time.monotonic() + entry.wait_for_mcp_discovery(timeout=5.0) + assert time.monotonic() - start < 0.1 + finally: + _restore_thread_slot(saved) + + +def test_already_finished_thread_is_noop(): + """A thread that has already finished is not joined-on (dead thread).""" + saved = entry._mcp_discovery_thread + try: + t = threading.Thread(target=lambda: None, daemon=True) + t.start() + t.join() # ensure it's finished + entry._mcp_discovery_thread = t + start = time.monotonic() + entry.wait_for_mcp_discovery(timeout=5.0) + assert time.monotonic() - start < 0.1 + finally: + _restore_thread_slot(saved) + + +def test_fast_thread_is_joined(): + """A reachable-but-still-connecting (fast) server lands before the agent + snapshots tools — the helper waits for it to finish.""" + saved = entry._mcp_discovery_thread + try: + t = threading.Thread(target=lambda: time.sleep(0.05), daemon=True) + t.start() + entry._mcp_discovery_thread = t + entry.wait_for_mcp_discovery(timeout=1.0) + assert not t.is_alive() # joined to completion + finally: + _restore_thread_slot(saved) + + +def test_hung_thread_is_bounded_by_timeout(): + """A slow/dead server must NOT re-introduce the startup hang — the join is + bounded by the timeout and returns even though the thread is still alive.""" + saved = entry._mcp_discovery_thread + stop = threading.Event() + try: + t = threading.Thread(target=stop.wait, daemon=True) # blocks until set + t.start() + entry._mcp_discovery_thread = t + start = time.monotonic() + entry.wait_for_mcp_discovery(timeout=0.3) + elapsed = time.monotonic() - start + assert 0.25 <= elapsed < 1.0 # bounded near the timeout, not forever + assert t.is_alive() # thread still running; we did not block on it + finally: + stop.set() + _restore_thread_slot(saved) diff --git a/tui_gateway/entry.py b/tui_gateway/entry.py index 0400a3fcb..7069ec976 100644 --- a/tui_gateway/entry.py +++ b/tui_gateway/entry.py @@ -12,6 +12,7 @@ if _src_root and _src_root not in sys.path: sys.path = [p for p in sys.path if p not in {"", "."}] import json +import logging import signal import time import traceback @@ -20,6 +21,13 @@ from tui_gateway import server from tui_gateway.server import _CRASH_LOG, dispatch, resolve_skin, write_json from tui_gateway.transport import TeeTransport +logger = logging.getLogger(__name__) + +# Handle for the background MCP tool-discovery thread (see main()). The first +# agent build briefly joins this so already-spawning fast servers land before +# the agent snapshots its tool list (see wait_for_mcp_discovery). +_mcp_discovery_thread = None + def _install_sidecar_publisher() -> None: """Mirror every dispatcher emit to the dashboard sidebar via WS. @@ -184,37 +192,76 @@ def _log_exit(reason: str) -> None: print(f"[gateway-exit] {reason}", file=sys.stderr, flush=True) +def wait_for_mcp_discovery(timeout: float = 0.75) -> None: + """Briefly block until background MCP discovery finishes, up to ``timeout``. + + MCP discovery runs in a daemon thread spawned at startup (see main()) so a + slow/dead server can't freeze ``gateway.ready``. But the agent snapshots + its tool list ONCE at build time and never re-reads it, so a reachable-but- + slow server that finishes connecting *after* the first prompt would be + invisible for the whole session. Joining with a short bounded timeout + before the first agent build lets already-spawning fast servers land + without re-introducing the startup hang: a dead server simply isn't waited + on beyond ``timeout``. No-op when no discovery thread was started. + """ + thread = _mcp_discovery_thread + if thread is None or not thread.is_alive(): + return + thread.join(timeout=timeout) + + def main(): _install_sidecar_publisher() - # MCP tool discovery — inline is safe here: TUI entry is a plain - # sync loop with no asyncio event loop to block. Previously ran as - # a model_tools.py module-level side effect; moved to explicit - # startup calls to avoid freezing the gateway's loop on lazy import - # (#16856). + # MCP tool discovery — runs in a background daemon thread so a slow or + # unreachable MCP server can't freeze TUI startup. Previously this ran + # inline before ``gateway.ready``, which meant any configured-but-down + # server stalled the whole shell on "summoning hermes…" for the full + # connect-retry backoff (e.g. a dead stdio/http server burns 1+2+4s of + # retries → ~7s of dead air before the composer appears). Discovery is + # idempotent and registers tools into the shared registry as servers + # connect. The agent isn't built until the first prompt, at which point + # ``_make_agent`` briefly joins this thread (``wait_for_mcp_discovery``, + # bounded) so already-spawning fast servers land in the tool snapshot — + # a dead server is simply not waited on past the bound. ``/reload-mcp`` + # rebuilds the snapshot for servers that connect later in the session. # # Cold-start guard: importing ``tools.mcp_tool`` transitively pulls the # full MCP SDK (mcp, pydantic, httpx, jsonschema, starlette parsers — - # ~200ms on macOS), which runs on the TUI's critical path before - # ``gateway.ready`` can be emitted. The overwhelming majority of users - # have no ``mcp_servers`` configured, in which case every byte of that - # import is wasted. Check the config first (cheap — it's already been - # loaded once by ``_config_mtime`` elsewhere) and only pay the import - # cost when there's actually MCP work to do. + # ~200ms on macOS). The overwhelming majority of users have no + # ``mcp_servers`` configured, in which case every byte of that import is + # wasted. Check the config first (cheap) and only spawn the discovery + # thread when there's actually MCP work to do, so the import cost stays + # off the path entirely for the common case. try: from hermes_cli.config import read_raw_config _mcp_servers = (read_raw_config() or {}).get("mcp_servers") _has_mcp_servers = isinstance(_mcp_servers, dict) and len(_mcp_servers) > 0 except Exception: - # Be conservative: if we can't decide, fall back to the old - # behaviour and let the discovery path handle its own errors. + # Be conservative: if we can't decide, fall back to attempting + # discovery (still backgrounded, so it can't block startup). _has_mcp_servers = True if _has_mcp_servers: - try: - from tools.mcp_tool import discover_mcp_tools - discover_mcp_tools() - except Exception: - pass + def _discover_mcp_background() -> None: + try: + from tools.mcp_tool import discover_mcp_tools + discover_mcp_tools() + except Exception: + logger.warning( + "Background MCP tool discovery failed", exc_info=True + ) + + import threading as _mcp_threading + _mcp_thread = _mcp_threading.Thread( + target=_discover_mcp_background, + name="tui-mcp-discovery", + daemon=True, + ) + _mcp_thread.start() + # Publish the handle so the first agent build can briefly wait for + # already-spawning fast servers to land (see wait_for_mcp_discovery). + global _mcp_discovery_thread + _mcp_discovery_thread = _mcp_thread if not write_json({ "jsonrpc": "2.0", diff --git a/tui_gateway/server.py b/tui_gateway/server.py index 77d1ea502..4af8e2887 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -2005,6 +2005,19 @@ def _make_agent(sid: str, key: str, session_id: str | None = None): from run_agent import AIAgent from hermes_cli.runtime_provider import resolve_runtime_provider + # MCP tool discovery runs in a background daemon thread at startup so a + # dead server can't freeze the shell (see tui_gateway/entry.py). The agent + # snapshots its tool list once here and never re-reads it, so briefly wait + # for in-flight discovery to land before building — bounded, so a slow/dead + # server still can't block. No-op once discovery has finished (every build + # after the first during a slow startup). + try: + from tui_gateway.entry import wait_for_mcp_discovery + + wait_for_mcp_discovery() + except Exception: + pass + cfg = _load_cfg() agent_cfg = cfg.get("agent") or {} system_prompt = (agent_cfg.get("system_prompt", "") or "").strip() @@ -4690,8 +4703,28 @@ def _(rid, params: dict) -> dict: discover_mcp_tools() if session: agent = session["agent"] - if hasattr(agent, "refresh_tools"): - agent.refresh_tools() + # Rebuild the cached agent's tool snapshot so the current session + # picks up added/removed MCP tools without `/new` (which discards + # history). The agent snapshots tools once at build and never + # re-reads the registry, so an explicit rebuild is required here. + # The user already consented to the prompt-cache invalidation via + # the confirm gate above. Mirrors gateway/run.py::_execute_mcp_reload. + try: + from model_tools import get_tool_definitions + + new_defs = get_tool_definitions( + enabled_toolsets=_load_enabled_toolsets(), + quiet_mode=True, + ) + agent.tools = new_defs + agent.valid_tool_names = ( + {t["function"]["name"] for t in new_defs} if new_defs else set() + ) + except Exception as _exc: + logger.warning( + "Failed to refresh cached agent tools after /reload-mcp: %s", + _exc, + ) _emit("session.info", params.get("session_id", ""), _session_info(agent)) # Honor `always=true` by persisting the opt-out to config.