perf(tui): stop slow/dead MCP servers from freezing TUI startup
The 'summoning hermes…' phase blocked on gateway.ready, which ran MCP tool discovery inline. Any configured-but-unreachable MCP server burned its full connect-retry backoff (1+2+4s ≈ 7s) before the composer appeared — startup went from instant to ~7.5s of dead air for anyone with a down stdio/http server in mcp_servers. Move discovery into a background daemon thread so gateway.ready fires immediately; tools register into the shared registry as servers connect, and the agent isn't built until the first prompt. Measured spawn→ready: ~7500ms → ~115ms (dead twozero_td server in config). Also drop rich.console + prompt_toolkit off banner.py's import path (lazy-imported inside cprint/build_welcome_banner). tui_gateway.server imports banner only to reach the lightweight prefetch_update_check helper; the eager rich/pt imports added ~45ms before gateway.ready for no benefit. tui_gateway.server import: ~115ms → ~69ms.
This commit is contained in:
@ -12,14 +12,16 @@ import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from hermes_constants import get_hermes_home
|
||||
from typing import Dict, List, Optional
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional
|
||||
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
|
||||
from prompt_toolkit import print_formatted_text as _pt_print
|
||||
from prompt_toolkit.formatted_text import ANSI as _PT_ANSI
|
||||
# rich and prompt_toolkit are imported lazily (inside the functions that use
|
||||
# them) rather than at module level. Importing this module is on the TUI
|
||||
# gateway's critical startup path purely to reach the lightweight update-check
|
||||
# helpers (``prefetch_update_check``); pulling rich.console + prompt_toolkit
|
||||
# eagerly added ~50ms of wasted imports before ``gateway.ready`` could fire.
|
||||
# Keep the type-only reference available to checkers without the runtime cost.
|
||||
if TYPE_CHECKING:
|
||||
from rich.console import Console
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -36,6 +38,8 @@ _RST = "\033[0m"
|
||||
|
||||
def cprint(text: str):
|
||||
"""Print ANSI-colored text through prompt_toolkit's renderer."""
|
||||
from prompt_toolkit import print_formatted_text as _pt_print
|
||||
from prompt_toolkit.formatted_text import ANSI as _PT_ANSI
|
||||
_pt_print(_PT_ANSI(text))
|
||||
|
||||
|
||||
@ -471,7 +475,7 @@ def _display_toolset_name(toolset_name: str) -> str:
|
||||
)
|
||||
|
||||
|
||||
def build_welcome_banner(console: Console, model: str, cwd: str,
|
||||
def build_welcome_banner(console: "Console", model: str, cwd: str,
|
||||
tools: List[dict] = None,
|
||||
enabled_toolsets: List[str] = None,
|
||||
session_id: str = None,
|
||||
@ -490,6 +494,8 @@ def build_welcome_banner(console: Console, model: str, cwd: str,
|
||||
context_length: Model's context window size in tokens.
|
||||
"""
|
||||
from model_tools import check_tool_availability, TOOLSET_REQUIREMENTS
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
if get_toolset_for_tool is None:
|
||||
from model_tools import get_toolset_for_tool
|
||||
|
||||
|
||||
78
tests/tui_gateway/test_wait_for_mcp_discovery.py
Normal file
78
tests/tui_gateway/test_wait_for_mcp_discovery.py
Normal file
@ -0,0 +1,78 @@
|
||||
"""Tests for tui_gateway.entry.wait_for_mcp_discovery (PR #35245).
|
||||
|
||||
MCP tool discovery runs in a background daemon thread so a slow/dead server
|
||||
can't freeze ``gateway.ready``. The agent snapshots its tool list once at
|
||||
build time and never re-reads it, so ``_make_agent`` briefly joins the
|
||||
discovery thread before building — bounded, so a dead server can't re-introduce
|
||||
the startup hang, and a no-op once discovery has finished.
|
||||
"""
|
||||
|
||||
import threading
|
||||
import time
|
||||
|
||||
import tui_gateway.entry as entry
|
||||
|
||||
|
||||
def _restore_thread_slot(saved):
|
||||
entry._mcp_discovery_thread = saved
|
||||
|
||||
|
||||
def test_no_thread_is_noop():
|
||||
"""When no discovery thread was started (the common no-MCP case), the
|
||||
helper returns immediately and never blocks."""
|
||||
saved = entry._mcp_discovery_thread
|
||||
try:
|
||||
entry._mcp_discovery_thread = None
|
||||
start = time.monotonic()
|
||||
entry.wait_for_mcp_discovery(timeout=5.0)
|
||||
assert time.monotonic() - start < 0.1
|
||||
finally:
|
||||
_restore_thread_slot(saved)
|
||||
|
||||
|
||||
def test_already_finished_thread_is_noop():
|
||||
"""A thread that has already finished is not joined-on (dead thread)."""
|
||||
saved = entry._mcp_discovery_thread
|
||||
try:
|
||||
t = threading.Thread(target=lambda: None, daemon=True)
|
||||
t.start()
|
||||
t.join() # ensure it's finished
|
||||
entry._mcp_discovery_thread = t
|
||||
start = time.monotonic()
|
||||
entry.wait_for_mcp_discovery(timeout=5.0)
|
||||
assert time.monotonic() - start < 0.1
|
||||
finally:
|
||||
_restore_thread_slot(saved)
|
||||
|
||||
|
||||
def test_fast_thread_is_joined():
|
||||
"""A reachable-but-still-connecting (fast) server lands before the agent
|
||||
snapshots tools — the helper waits for it to finish."""
|
||||
saved = entry._mcp_discovery_thread
|
||||
try:
|
||||
t = threading.Thread(target=lambda: time.sleep(0.05), daemon=True)
|
||||
t.start()
|
||||
entry._mcp_discovery_thread = t
|
||||
entry.wait_for_mcp_discovery(timeout=1.0)
|
||||
assert not t.is_alive() # joined to completion
|
||||
finally:
|
||||
_restore_thread_slot(saved)
|
||||
|
||||
|
||||
def test_hung_thread_is_bounded_by_timeout():
|
||||
"""A slow/dead server must NOT re-introduce the startup hang — the join is
|
||||
bounded by the timeout and returns even though the thread is still alive."""
|
||||
saved = entry._mcp_discovery_thread
|
||||
stop = threading.Event()
|
||||
try:
|
||||
t = threading.Thread(target=stop.wait, daemon=True) # blocks until set
|
||||
t.start()
|
||||
entry._mcp_discovery_thread = t
|
||||
start = time.monotonic()
|
||||
entry.wait_for_mcp_discovery(timeout=0.3)
|
||||
elapsed = time.monotonic() - start
|
||||
assert 0.25 <= elapsed < 1.0 # bounded near the timeout, not forever
|
||||
assert t.is_alive() # thread still running; we did not block on it
|
||||
finally:
|
||||
stop.set()
|
||||
_restore_thread_slot(saved)
|
||||
@ -12,6 +12,7 @@ if _src_root and _src_root not in sys.path:
|
||||
sys.path = [p for p in sys.path if p not in {"", "."}]
|
||||
|
||||
import json
|
||||
import logging
|
||||
import signal
|
||||
import time
|
||||
import traceback
|
||||
@ -20,6 +21,13 @@ from tui_gateway import server
|
||||
from tui_gateway.server import _CRASH_LOG, dispatch, resolve_skin, write_json
|
||||
from tui_gateway.transport import TeeTransport
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Handle for the background MCP tool-discovery thread (see main()). The first
|
||||
# agent build briefly joins this so already-spawning fast servers land before
|
||||
# the agent snapshots its tool list (see wait_for_mcp_discovery).
|
||||
_mcp_discovery_thread = None
|
||||
|
||||
|
||||
def _install_sidecar_publisher() -> None:
|
||||
"""Mirror every dispatcher emit to the dashboard sidebar via WS.
|
||||
@ -184,37 +192,76 @@ def _log_exit(reason: str) -> None:
|
||||
print(f"[gateway-exit] {reason}", file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def wait_for_mcp_discovery(timeout: float = 0.75) -> None:
|
||||
"""Briefly block until background MCP discovery finishes, up to ``timeout``.
|
||||
|
||||
MCP discovery runs in a daemon thread spawned at startup (see main()) so a
|
||||
slow/dead server can't freeze ``gateway.ready``. But the agent snapshots
|
||||
its tool list ONCE at build time and never re-reads it, so a reachable-but-
|
||||
slow server that finishes connecting *after* the first prompt would be
|
||||
invisible for the whole session. Joining with a short bounded timeout
|
||||
before the first agent build lets already-spawning fast servers land
|
||||
without re-introducing the startup hang: a dead server simply isn't waited
|
||||
on beyond ``timeout``. No-op when no discovery thread was started.
|
||||
"""
|
||||
thread = _mcp_discovery_thread
|
||||
if thread is None or not thread.is_alive():
|
||||
return
|
||||
thread.join(timeout=timeout)
|
||||
|
||||
|
||||
def main():
|
||||
_install_sidecar_publisher()
|
||||
|
||||
# MCP tool discovery — inline is safe here: TUI entry is a plain
|
||||
# sync loop with no asyncio event loop to block. Previously ran as
|
||||
# a model_tools.py module-level side effect; moved to explicit
|
||||
# startup calls to avoid freezing the gateway's loop on lazy import
|
||||
# (#16856).
|
||||
# MCP tool discovery — runs in a background daemon thread so a slow or
|
||||
# unreachable MCP server can't freeze TUI startup. Previously this ran
|
||||
# inline before ``gateway.ready``, which meant any configured-but-down
|
||||
# server stalled the whole shell on "summoning hermes…" for the full
|
||||
# connect-retry backoff (e.g. a dead stdio/http server burns 1+2+4s of
|
||||
# retries → ~7s of dead air before the composer appears). Discovery is
|
||||
# idempotent and registers tools into the shared registry as servers
|
||||
# connect. The agent isn't built until the first prompt, at which point
|
||||
# ``_make_agent`` briefly joins this thread (``wait_for_mcp_discovery``,
|
||||
# bounded) so already-spawning fast servers land in the tool snapshot —
|
||||
# a dead server is simply not waited on past the bound. ``/reload-mcp``
|
||||
# rebuilds the snapshot for servers that connect later in the session.
|
||||
#
|
||||
# Cold-start guard: importing ``tools.mcp_tool`` transitively pulls the
|
||||
# full MCP SDK (mcp, pydantic, httpx, jsonschema, starlette parsers —
|
||||
# ~200ms on macOS), which runs on the TUI's critical path before
|
||||
# ``gateway.ready`` can be emitted. The overwhelming majority of users
|
||||
# have no ``mcp_servers`` configured, in which case every byte of that
|
||||
# import is wasted. Check the config first (cheap — it's already been
|
||||
# loaded once by ``_config_mtime`` elsewhere) and only pay the import
|
||||
# cost when there's actually MCP work to do.
|
||||
# ~200ms on macOS). The overwhelming majority of users have no
|
||||
# ``mcp_servers`` configured, in which case every byte of that import is
|
||||
# wasted. Check the config first (cheap) and only spawn the discovery
|
||||
# thread when there's actually MCP work to do, so the import cost stays
|
||||
# off the path entirely for the common case.
|
||||
try:
|
||||
from hermes_cli.config import read_raw_config
|
||||
_mcp_servers = (read_raw_config() or {}).get("mcp_servers")
|
||||
_has_mcp_servers = isinstance(_mcp_servers, dict) and len(_mcp_servers) > 0
|
||||
except Exception:
|
||||
# Be conservative: if we can't decide, fall back to the old
|
||||
# behaviour and let the discovery path handle its own errors.
|
||||
# Be conservative: if we can't decide, fall back to attempting
|
||||
# discovery (still backgrounded, so it can't block startup).
|
||||
_has_mcp_servers = True
|
||||
if _has_mcp_servers:
|
||||
try:
|
||||
from tools.mcp_tool import discover_mcp_tools
|
||||
discover_mcp_tools()
|
||||
except Exception:
|
||||
pass
|
||||
def _discover_mcp_background() -> None:
|
||||
try:
|
||||
from tools.mcp_tool import discover_mcp_tools
|
||||
discover_mcp_tools()
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"Background MCP tool discovery failed", exc_info=True
|
||||
)
|
||||
|
||||
import threading as _mcp_threading
|
||||
_mcp_thread = _mcp_threading.Thread(
|
||||
target=_discover_mcp_background,
|
||||
name="tui-mcp-discovery",
|
||||
daemon=True,
|
||||
)
|
||||
_mcp_thread.start()
|
||||
# Publish the handle so the first agent build can briefly wait for
|
||||
# already-spawning fast servers to land (see wait_for_mcp_discovery).
|
||||
global _mcp_discovery_thread
|
||||
_mcp_discovery_thread = _mcp_thread
|
||||
|
||||
if not write_json({
|
||||
"jsonrpc": "2.0",
|
||||
|
||||
@ -2005,6 +2005,19 @@ def _make_agent(sid: str, key: str, session_id: str | None = None):
|
||||
from run_agent import AIAgent
|
||||
from hermes_cli.runtime_provider import resolve_runtime_provider
|
||||
|
||||
# MCP tool discovery runs in a background daemon thread at startup so a
|
||||
# dead server can't freeze the shell (see tui_gateway/entry.py). The agent
|
||||
# snapshots its tool list once here and never re-reads it, so briefly wait
|
||||
# for in-flight discovery to land before building — bounded, so a slow/dead
|
||||
# server still can't block. No-op once discovery has finished (every build
|
||||
# after the first during a slow startup).
|
||||
try:
|
||||
from tui_gateway.entry import wait_for_mcp_discovery
|
||||
|
||||
wait_for_mcp_discovery()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
cfg = _load_cfg()
|
||||
agent_cfg = cfg.get("agent") or {}
|
||||
system_prompt = (agent_cfg.get("system_prompt", "") or "").strip()
|
||||
@ -4690,8 +4703,28 @@ def _(rid, params: dict) -> dict:
|
||||
discover_mcp_tools()
|
||||
if session:
|
||||
agent = session["agent"]
|
||||
if hasattr(agent, "refresh_tools"):
|
||||
agent.refresh_tools()
|
||||
# Rebuild the cached agent's tool snapshot so the current session
|
||||
# picks up added/removed MCP tools without `/new` (which discards
|
||||
# history). The agent snapshots tools once at build and never
|
||||
# re-reads the registry, so an explicit rebuild is required here.
|
||||
# The user already consented to the prompt-cache invalidation via
|
||||
# the confirm gate above. Mirrors gateway/run.py::_execute_mcp_reload.
|
||||
try:
|
||||
from model_tools import get_tool_definitions
|
||||
|
||||
new_defs = get_tool_definitions(
|
||||
enabled_toolsets=_load_enabled_toolsets(),
|
||||
quiet_mode=True,
|
||||
)
|
||||
agent.tools = new_defs
|
||||
agent.valid_tool_names = (
|
||||
{t["function"]["name"] for t in new_defs} if new_defs else set()
|
||||
)
|
||||
except Exception as _exc:
|
||||
logger.warning(
|
||||
"Failed to refresh cached agent tools after /reload-mcp: %s",
|
||||
_exc,
|
||||
)
|
||||
_emit("session.info", params.get("session_id", ""), _session_info(agent))
|
||||
|
||||
# Honor `always=true` by persisting the opt-out to config.
|
||||
|
||||
Reference in New Issue
Block a user