perf(tui): stop slow/dead MCP servers from freezing TUI startup
The 'summoning hermes…' phase blocked on gateway.ready, which ran MCP tool discovery inline. Any configured-but-unreachable MCP server burned its full connect-retry backoff (1+2+4s ≈ 7s) before the composer appeared — startup went from instant to ~7.5s of dead air for anyone with a down stdio/http server in mcp_servers. Move discovery into a background daemon thread so gateway.ready fires immediately; tools register into the shared registry as servers connect, and the agent isn't built until the first prompt. Measured spawn→ready: ~7500ms → ~115ms (dead twozero_td server in config). Also drop rich.console + prompt_toolkit off banner.py's import path (lazy-imported inside cprint/build_welcome_banner). tui_gateway.server imports banner only to reach the lightweight prefetch_update_check helper; the eager rich/pt imports added ~45ms before gateway.ready for no benefit. tui_gateway.server import: ~115ms → ~69ms.
This commit is contained in:
@ -12,14 +12,16 @@ import threading
|
|||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from hermes_constants import get_hermes_home
|
from hermes_constants import get_hermes_home
|
||||||
from typing import Dict, List, Optional
|
from typing import TYPE_CHECKING, Dict, List, Optional
|
||||||
|
|
||||||
from rich.console import Console
|
# rich and prompt_toolkit are imported lazily (inside the functions that use
|
||||||
from rich.panel import Panel
|
# them) rather than at module level. Importing this module is on the TUI
|
||||||
from rich.table import Table
|
# gateway's critical startup path purely to reach the lightweight update-check
|
||||||
|
# helpers (``prefetch_update_check``); pulling rich.console + prompt_toolkit
|
||||||
from prompt_toolkit import print_formatted_text as _pt_print
|
# eagerly added ~50ms of wasted imports before ``gateway.ready`` could fire.
|
||||||
from prompt_toolkit.formatted_text import ANSI as _PT_ANSI
|
# Keep the type-only reference available to checkers without the runtime cost.
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from rich.console import Console
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -36,6 +38,8 @@ _RST = "\033[0m"
|
|||||||
|
|
||||||
def cprint(text: str):
|
def cprint(text: str):
|
||||||
"""Print ANSI-colored text through prompt_toolkit's renderer."""
|
"""Print ANSI-colored text through prompt_toolkit's renderer."""
|
||||||
|
from prompt_toolkit import print_formatted_text as _pt_print
|
||||||
|
from prompt_toolkit.formatted_text import ANSI as _PT_ANSI
|
||||||
_pt_print(_PT_ANSI(text))
|
_pt_print(_PT_ANSI(text))
|
||||||
|
|
||||||
|
|
||||||
@ -471,7 +475,7 @@ def _display_toolset_name(toolset_name: str) -> str:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def build_welcome_banner(console: Console, model: str, cwd: str,
|
def build_welcome_banner(console: "Console", model: str, cwd: str,
|
||||||
tools: List[dict] = None,
|
tools: List[dict] = None,
|
||||||
enabled_toolsets: List[str] = None,
|
enabled_toolsets: List[str] = None,
|
||||||
session_id: str = None,
|
session_id: str = None,
|
||||||
@ -490,6 +494,8 @@ def build_welcome_banner(console: Console, model: str, cwd: str,
|
|||||||
context_length: Model's context window size in tokens.
|
context_length: Model's context window size in tokens.
|
||||||
"""
|
"""
|
||||||
from model_tools import check_tool_availability, TOOLSET_REQUIREMENTS
|
from model_tools import check_tool_availability, TOOLSET_REQUIREMENTS
|
||||||
|
from rich.panel import Panel
|
||||||
|
from rich.table import Table
|
||||||
if get_toolset_for_tool is None:
|
if get_toolset_for_tool is None:
|
||||||
from model_tools import get_toolset_for_tool
|
from model_tools import get_toolset_for_tool
|
||||||
|
|
||||||
|
|||||||
78
tests/tui_gateway/test_wait_for_mcp_discovery.py
Normal file
78
tests/tui_gateway/test_wait_for_mcp_discovery.py
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
"""Tests for tui_gateway.entry.wait_for_mcp_discovery (PR #35245).
|
||||||
|
|
||||||
|
MCP tool discovery runs in a background daemon thread so a slow/dead server
|
||||||
|
can't freeze ``gateway.ready``. The agent snapshots its tool list once at
|
||||||
|
build time and never re-reads it, so ``_make_agent`` briefly joins the
|
||||||
|
discovery thread before building — bounded, so a dead server can't re-introduce
|
||||||
|
the startup hang, and a no-op once discovery has finished.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
|
||||||
|
import tui_gateway.entry as entry
|
||||||
|
|
||||||
|
|
||||||
|
def _restore_thread_slot(saved):
|
||||||
|
entry._mcp_discovery_thread = saved
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_thread_is_noop():
|
||||||
|
"""When no discovery thread was started (the common no-MCP case), the
|
||||||
|
helper returns immediately and never blocks."""
|
||||||
|
saved = entry._mcp_discovery_thread
|
||||||
|
try:
|
||||||
|
entry._mcp_discovery_thread = None
|
||||||
|
start = time.monotonic()
|
||||||
|
entry.wait_for_mcp_discovery(timeout=5.0)
|
||||||
|
assert time.monotonic() - start < 0.1
|
||||||
|
finally:
|
||||||
|
_restore_thread_slot(saved)
|
||||||
|
|
||||||
|
|
||||||
|
def test_already_finished_thread_is_noop():
|
||||||
|
"""A thread that has already finished is not joined-on (dead thread)."""
|
||||||
|
saved = entry._mcp_discovery_thread
|
||||||
|
try:
|
||||||
|
t = threading.Thread(target=lambda: None, daemon=True)
|
||||||
|
t.start()
|
||||||
|
t.join() # ensure it's finished
|
||||||
|
entry._mcp_discovery_thread = t
|
||||||
|
start = time.monotonic()
|
||||||
|
entry.wait_for_mcp_discovery(timeout=5.0)
|
||||||
|
assert time.monotonic() - start < 0.1
|
||||||
|
finally:
|
||||||
|
_restore_thread_slot(saved)
|
||||||
|
|
||||||
|
|
||||||
|
def test_fast_thread_is_joined():
|
||||||
|
"""A reachable-but-still-connecting (fast) server lands before the agent
|
||||||
|
snapshots tools — the helper waits for it to finish."""
|
||||||
|
saved = entry._mcp_discovery_thread
|
||||||
|
try:
|
||||||
|
t = threading.Thread(target=lambda: time.sleep(0.05), daemon=True)
|
||||||
|
t.start()
|
||||||
|
entry._mcp_discovery_thread = t
|
||||||
|
entry.wait_for_mcp_discovery(timeout=1.0)
|
||||||
|
assert not t.is_alive() # joined to completion
|
||||||
|
finally:
|
||||||
|
_restore_thread_slot(saved)
|
||||||
|
|
||||||
|
|
||||||
|
def test_hung_thread_is_bounded_by_timeout():
|
||||||
|
"""A slow/dead server must NOT re-introduce the startup hang — the join is
|
||||||
|
bounded by the timeout and returns even though the thread is still alive."""
|
||||||
|
saved = entry._mcp_discovery_thread
|
||||||
|
stop = threading.Event()
|
||||||
|
try:
|
||||||
|
t = threading.Thread(target=stop.wait, daemon=True) # blocks until set
|
||||||
|
t.start()
|
||||||
|
entry._mcp_discovery_thread = t
|
||||||
|
start = time.monotonic()
|
||||||
|
entry.wait_for_mcp_discovery(timeout=0.3)
|
||||||
|
elapsed = time.monotonic() - start
|
||||||
|
assert 0.25 <= elapsed < 1.0 # bounded near the timeout, not forever
|
||||||
|
assert t.is_alive() # thread still running; we did not block on it
|
||||||
|
finally:
|
||||||
|
stop.set()
|
||||||
|
_restore_thread_slot(saved)
|
||||||
@ -12,6 +12,7 @@ if _src_root and _src_root not in sys.path:
|
|||||||
sys.path = [p for p in sys.path if p not in {"", "."}]
|
sys.path = [p for p in sys.path if p not in {"", "."}]
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import signal
|
import signal
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
@ -20,6 +21,13 @@ from tui_gateway import server
|
|||||||
from tui_gateway.server import _CRASH_LOG, dispatch, resolve_skin, write_json
|
from tui_gateway.server import _CRASH_LOG, dispatch, resolve_skin, write_json
|
||||||
from tui_gateway.transport import TeeTransport
|
from tui_gateway.transport import TeeTransport
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Handle for the background MCP tool-discovery thread (see main()). The first
|
||||||
|
# agent build briefly joins this so already-spawning fast servers land before
|
||||||
|
# the agent snapshots its tool list (see wait_for_mcp_discovery).
|
||||||
|
_mcp_discovery_thread = None
|
||||||
|
|
||||||
|
|
||||||
def _install_sidecar_publisher() -> None:
|
def _install_sidecar_publisher() -> None:
|
||||||
"""Mirror every dispatcher emit to the dashboard sidebar via WS.
|
"""Mirror every dispatcher emit to the dashboard sidebar via WS.
|
||||||
@ -184,37 +192,76 @@ def _log_exit(reason: str) -> None:
|
|||||||
print(f"[gateway-exit] {reason}", file=sys.stderr, flush=True)
|
print(f"[gateway-exit] {reason}", file=sys.stderr, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_mcp_discovery(timeout: float = 0.75) -> None:
|
||||||
|
"""Briefly block until background MCP discovery finishes, up to ``timeout``.
|
||||||
|
|
||||||
|
MCP discovery runs in a daemon thread spawned at startup (see main()) so a
|
||||||
|
slow/dead server can't freeze ``gateway.ready``. But the agent snapshots
|
||||||
|
its tool list ONCE at build time and never re-reads it, so a reachable-but-
|
||||||
|
slow server that finishes connecting *after* the first prompt would be
|
||||||
|
invisible for the whole session. Joining with a short bounded timeout
|
||||||
|
before the first agent build lets already-spawning fast servers land
|
||||||
|
without re-introducing the startup hang: a dead server simply isn't waited
|
||||||
|
on beyond ``timeout``. No-op when no discovery thread was started.
|
||||||
|
"""
|
||||||
|
thread = _mcp_discovery_thread
|
||||||
|
if thread is None or not thread.is_alive():
|
||||||
|
return
|
||||||
|
thread.join(timeout=timeout)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
_install_sidecar_publisher()
|
_install_sidecar_publisher()
|
||||||
|
|
||||||
# MCP tool discovery — inline is safe here: TUI entry is a plain
|
# MCP tool discovery — runs in a background daemon thread so a slow or
|
||||||
# sync loop with no asyncio event loop to block. Previously ran as
|
# unreachable MCP server can't freeze TUI startup. Previously this ran
|
||||||
# a model_tools.py module-level side effect; moved to explicit
|
# inline before ``gateway.ready``, which meant any configured-but-down
|
||||||
# startup calls to avoid freezing the gateway's loop on lazy import
|
# server stalled the whole shell on "summoning hermes…" for the full
|
||||||
# (#16856).
|
# connect-retry backoff (e.g. a dead stdio/http server burns 1+2+4s of
|
||||||
|
# retries → ~7s of dead air before the composer appears). Discovery is
|
||||||
|
# idempotent and registers tools into the shared registry as servers
|
||||||
|
# connect. The agent isn't built until the first prompt, at which point
|
||||||
|
# ``_make_agent`` briefly joins this thread (``wait_for_mcp_discovery``,
|
||||||
|
# bounded) so already-spawning fast servers land in the tool snapshot —
|
||||||
|
# a dead server is simply not waited on past the bound. ``/reload-mcp``
|
||||||
|
# rebuilds the snapshot for servers that connect later in the session.
|
||||||
#
|
#
|
||||||
# Cold-start guard: importing ``tools.mcp_tool`` transitively pulls the
|
# Cold-start guard: importing ``tools.mcp_tool`` transitively pulls the
|
||||||
# full MCP SDK (mcp, pydantic, httpx, jsonschema, starlette parsers —
|
# full MCP SDK (mcp, pydantic, httpx, jsonschema, starlette parsers —
|
||||||
# ~200ms on macOS), which runs on the TUI's critical path before
|
# ~200ms on macOS). The overwhelming majority of users have no
|
||||||
# ``gateway.ready`` can be emitted. The overwhelming majority of users
|
# ``mcp_servers`` configured, in which case every byte of that import is
|
||||||
# have no ``mcp_servers`` configured, in which case every byte of that
|
# wasted. Check the config first (cheap) and only spawn the discovery
|
||||||
# import is wasted. Check the config first (cheap — it's already been
|
# thread when there's actually MCP work to do, so the import cost stays
|
||||||
# loaded once by ``_config_mtime`` elsewhere) and only pay the import
|
# off the path entirely for the common case.
|
||||||
# cost when there's actually MCP work to do.
|
|
||||||
try:
|
try:
|
||||||
from hermes_cli.config import read_raw_config
|
from hermes_cli.config import read_raw_config
|
||||||
_mcp_servers = (read_raw_config() or {}).get("mcp_servers")
|
_mcp_servers = (read_raw_config() or {}).get("mcp_servers")
|
||||||
_has_mcp_servers = isinstance(_mcp_servers, dict) and len(_mcp_servers) > 0
|
_has_mcp_servers = isinstance(_mcp_servers, dict) and len(_mcp_servers) > 0
|
||||||
except Exception:
|
except Exception:
|
||||||
# Be conservative: if we can't decide, fall back to the old
|
# Be conservative: if we can't decide, fall back to attempting
|
||||||
# behaviour and let the discovery path handle its own errors.
|
# discovery (still backgrounded, so it can't block startup).
|
||||||
_has_mcp_servers = True
|
_has_mcp_servers = True
|
||||||
if _has_mcp_servers:
|
if _has_mcp_servers:
|
||||||
try:
|
def _discover_mcp_background() -> None:
|
||||||
from tools.mcp_tool import discover_mcp_tools
|
try:
|
||||||
discover_mcp_tools()
|
from tools.mcp_tool import discover_mcp_tools
|
||||||
except Exception:
|
discover_mcp_tools()
|
||||||
pass
|
except Exception:
|
||||||
|
logger.warning(
|
||||||
|
"Background MCP tool discovery failed", exc_info=True
|
||||||
|
)
|
||||||
|
|
||||||
|
import threading as _mcp_threading
|
||||||
|
_mcp_thread = _mcp_threading.Thread(
|
||||||
|
target=_discover_mcp_background,
|
||||||
|
name="tui-mcp-discovery",
|
||||||
|
daemon=True,
|
||||||
|
)
|
||||||
|
_mcp_thread.start()
|
||||||
|
# Publish the handle so the first agent build can briefly wait for
|
||||||
|
# already-spawning fast servers to land (see wait_for_mcp_discovery).
|
||||||
|
global _mcp_discovery_thread
|
||||||
|
_mcp_discovery_thread = _mcp_thread
|
||||||
|
|
||||||
if not write_json({
|
if not write_json({
|
||||||
"jsonrpc": "2.0",
|
"jsonrpc": "2.0",
|
||||||
|
|||||||
@ -2005,6 +2005,19 @@ def _make_agent(sid: str, key: str, session_id: str | None = None):
|
|||||||
from run_agent import AIAgent
|
from run_agent import AIAgent
|
||||||
from hermes_cli.runtime_provider import resolve_runtime_provider
|
from hermes_cli.runtime_provider import resolve_runtime_provider
|
||||||
|
|
||||||
|
# MCP tool discovery runs in a background daemon thread at startup so a
|
||||||
|
# dead server can't freeze the shell (see tui_gateway/entry.py). The agent
|
||||||
|
# snapshots its tool list once here and never re-reads it, so briefly wait
|
||||||
|
# for in-flight discovery to land before building — bounded, so a slow/dead
|
||||||
|
# server still can't block. No-op once discovery has finished (every build
|
||||||
|
# after the first during a slow startup).
|
||||||
|
try:
|
||||||
|
from tui_gateway.entry import wait_for_mcp_discovery
|
||||||
|
|
||||||
|
wait_for_mcp_discovery()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
cfg = _load_cfg()
|
cfg = _load_cfg()
|
||||||
agent_cfg = cfg.get("agent") or {}
|
agent_cfg = cfg.get("agent") or {}
|
||||||
system_prompt = (agent_cfg.get("system_prompt", "") or "").strip()
|
system_prompt = (agent_cfg.get("system_prompt", "") or "").strip()
|
||||||
@ -4690,8 +4703,28 @@ def _(rid, params: dict) -> dict:
|
|||||||
discover_mcp_tools()
|
discover_mcp_tools()
|
||||||
if session:
|
if session:
|
||||||
agent = session["agent"]
|
agent = session["agent"]
|
||||||
if hasattr(agent, "refresh_tools"):
|
# Rebuild the cached agent's tool snapshot so the current session
|
||||||
agent.refresh_tools()
|
# picks up added/removed MCP tools without `/new` (which discards
|
||||||
|
# history). The agent snapshots tools once at build and never
|
||||||
|
# re-reads the registry, so an explicit rebuild is required here.
|
||||||
|
# The user already consented to the prompt-cache invalidation via
|
||||||
|
# the confirm gate above. Mirrors gateway/run.py::_execute_mcp_reload.
|
||||||
|
try:
|
||||||
|
from model_tools import get_tool_definitions
|
||||||
|
|
||||||
|
new_defs = get_tool_definitions(
|
||||||
|
enabled_toolsets=_load_enabled_toolsets(),
|
||||||
|
quiet_mode=True,
|
||||||
|
)
|
||||||
|
agent.tools = new_defs
|
||||||
|
agent.valid_tool_names = (
|
||||||
|
{t["function"]["name"] for t in new_defs} if new_defs else set()
|
||||||
|
)
|
||||||
|
except Exception as _exc:
|
||||||
|
logger.warning(
|
||||||
|
"Failed to refresh cached agent tools after /reload-mcp: %s",
|
||||||
|
_exc,
|
||||||
|
)
|
||||||
_emit("session.info", params.get("session_id", ""), _session_info(agent))
|
_emit("session.info", params.get("session_id", ""), _session_info(agent))
|
||||||
|
|
||||||
# Honor `always=true` by persisting the opt-out to config.
|
# Honor `always=true` by persisting the opt-out to config.
|
||||||
|
|||||||
Reference in New Issue
Block a user