diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 9ca0adbf0..a13e21ceb 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -1626,6 +1626,77 @@ def _normalize_tui_toolsets(toolsets: object) -> list[str]: return [item for item in normalized if item] +def _read_cgroup_memory_limit() -> Optional[int]: + """Return the container memory limit in bytes, or None if unconstrained. + + Node's V8 heap is NOT cgroup-aware: with a flat ``--max-old-space-size=8192`` + it happily grows the heap toward 8GB regardless of the container's real + memory limit. In a Docker/k8s container capped below ~9-10GB, the cgroup + OOM-killer SIGKILLs Node before V8's own heap monitor ever fires — which + runs no JS handler, writes no ``[tui-parent]`` breadcrumb, and the user + sees only a bare gateway ``stdin EOF``. Reading the real cgroup limit lets + us size the heap cap below it so V8 GCs/exits gracefully instead of being + reaped silently. + + Checks cgroup v2 (``/sys/fs/cgroup/memory.max``) then v1 + (``/sys/fs/cgroup/memory/memory.limit_in_bytes``). A literal ``max`` (v2) + or the v1 "unlimited" sentinel (a huge near-INT64 value) means no limit. + """ + candidates = ( + "/sys/fs/cgroup/memory.max", # cgroup v2 + "/sys/fs/cgroup/memory/memory.limit_in_bytes", # cgroup v1 + ) + for path in candidates: + try: + with open(path, "r", encoding="utf-8") as f: + raw = f.read().strip() + except (OSError, ValueError): + continue + if raw == "max": + return None + if not raw: + # Blank/empty file: no usable value here. Fall through to the next + # candidate (don't mistake an empty v2 file for "unlimited"). + continue + try: + limit = int(raw) + except ValueError: + continue + if limit <= 0: + continue + # cgroup v1 reports "unlimited" as a huge value (often + # 0x7FFFFFFFFFFFF000 ≈ 9.2 EB, sometimes PAGE_COUNTER_MAX). Anything + # at/above ~1 PB is effectively unconstrained — treat as no limit. + if limit >= (1 << 50): + return None + return limit + return None + + +def _resolve_tui_heap_mb(default_mb: int = 8192) -> int: + """Pick a V8 ``--max-old-space-size`` (MB) that fits the container. + + Returns ``default_mb`` (8192) when unconstrained or when the box is large + enough that 8GB fits. In a memory-limited container, returns ~75% of the + cgroup limit so the heap + non-heap RSS stays under the cgroup ceiling, + clamped to a sane floor (1536MB — below this V8 GC-thrashes and the TUI + is barely usable). Never exceeds ``default_mb``. + """ + limit = _read_cgroup_memory_limit() + if not limit: + return default_mb + limit_mb = limit // (1024 * 1024) + # Leave headroom for non-heap RSS (Node internals, buffers, the Python + # gateway child shares the same cgroup): cap the heap at 75% of the limit. + sized = int(limit_mb * 0.75) + if sized >= default_mb: + return default_mb + # Floor so a tiny limit doesn't drive V8 into constant GC. If the container + # is smaller than the floor, honor the limit-derived value anyway (better a + # graceful V8 exit than a silent cgroup kill). + return max(1536, sized) if limit_mb > 2048 else sized + + def _launch_tui( resume_session_id: Optional[str] = None, tui_dev: bool = False, @@ -1721,16 +1792,23 @@ def _launch_tui( env["HERMES_TUI_TOOL_PROGRESS"] = "off" if accept_hooks: env["HERMES_ACCEPT_HOOKS"] = "1" - # Guarantee an 8GB V8 heap for the TUI. Default node cap is ~1.5–4GB + # Guarantee a generous V8 heap for the TUI. Default node cap is ~1.5–4GB # depending on version and can fatal-OOM on long sessions with large - # transcripts / reasoning blobs. Token-level merge: respect any + # transcripts / reasoning blobs. We target 8GB on an unconstrained host, + # but V8 is NOT cgroup-aware: in a memory-limited Docker/k8s container a + # flat 8GB heap grows past the container limit and the cgroup OOM-killer + # SIGKILLs Node — running no JS handler, writing no breadcrumb, leaving the + # user with only a bare gateway `stdin EOF`. _resolve_tui_heap_mb() reads + # the real cgroup limit and sizes the cap below it so V8 GCs/exits + # gracefully (and the memory monitor's onCritical breadcrumb can fire) + # instead of being reaped silently. Token-level merge: respect any # user-supplied --max-old-space-size (they may have set it higher). # --expose-gc is *not* added here: Node rejects it in NODE_OPTIONS # ("--expose-gc is not allowed in NODE_OPTIONS") and refuses to start. # It is passed as a direct argv flag in _make_tui_argv() instead. _tokens = env.get("NODE_OPTIONS", "").split() if not any(t.startswith("--max-old-space-size=") for t in _tokens): - _tokens.append("--max-old-space-size=8192") + _tokens.append(f"--max-old-space-size={_resolve_tui_heap_mb()}") env["NODE_OPTIONS"] = " ".join(_tokens) # HERMES_TUI_RESUME is an internal hand-off from the Python wrapper to the # Ink app. Because we start from os.environ.copy(), an exported/stale value diff --git a/tests/hermes_cli/test_tui_heap_sizing.py b/tests/hermes_cli/test_tui_heap_sizing.py new file mode 100644 index 000000000..ab172a861 --- /dev/null +++ b/tests/hermes_cli/test_tui_heap_sizing.py @@ -0,0 +1,123 @@ +"""Tests for cgroup-aware TUI V8 heap sizing. + +V8 is not cgroup-aware: a flat ``--max-old-space-size=8192`` lets the heap grow +toward 8GB in a memory-limited container, so the cgroup OOM-killer SIGKILLs Node +before V8's own monitor fires — leaving the user with only a bare gateway +``stdin EOF`` and no breadcrumb. ``_resolve_tui_heap_mb`` reads the real cgroup +limit and sizes the cap below it so V8 exits gracefully instead. +""" + +import builtins +import io +from unittest import mock + +import hermes_cli.main as m + +V2 = "/sys/fs/cgroup/memory.max" +V1 = "/sys/fs/cgroup/memory/memory.limit_in_bytes" +GB = 1024 ** 3 + + +def _fake_open(files: dict): + """Return an open() shim serving cgroup paths from ``files`` (path->str).""" + real_open = builtins.open + + def opener(path, *args, **kwargs): + if path in (V2, V1): + content = files.get(path) + if content is None: + raise FileNotFoundError(path) + return io.StringIO(content) + return real_open(path, *args, **kwargs) + + return opener + + +def _read(files: dict): + with mock.patch.object(builtins, "open", _fake_open(files)): + return m._read_cgroup_memory_limit() + + +class TestReadCgroupMemoryLimit: + def test_v2_max_is_unlimited(self): + assert _read({V2: "max"}) is None + + def test_v2_numeric_limit(self): + assert _read({V2: str(4 * GB)}) == 4 * GB + + def test_v1_unlimited_sentinel_is_none(self): + # cgroup v1 reports "unlimited" as a near-INT64 huge value. + assert _read({V1: "9223372036854771712"}) is None + + def test_v1_numeric_limit_when_no_v2(self): + assert _read({V1: str(2 * GB)}) == 2 * GB + + def test_no_files_present(self): + assert _read({}) is None + + def test_empty_v2_falls_through_to_v1(self): + # A blank v2 file must NOT be mistaken for "unlimited" — fall to v1. + assert _read({V2: "", V1: str(3 * GB)}) == 3 * GB + + def test_v2_wins_over_v1(self): + assert _read({V2: str(6 * GB), V1: str(2 * GB)}) == 6 * GB + + def test_zero_is_skipped(self): + assert _read({V2: "0"}) is None + + def test_petabyte_plus_treated_as_unlimited(self): + assert _read({V2: str(1 << 51)}) is None + + +class TestResolveTuiHeapMb: + def _resolve(self, limit_bytes): + with mock.patch.object(m, "_read_cgroup_memory_limit", return_value=limit_bytes): + return m._resolve_tui_heap_mb() + + def test_unconstrained_uses_default(self): + assert self._resolve(None) == 8192 + + def test_large_container_clamps_to_default(self): + # 16GB -> 75% = 12288 >= 8192 -> clamp to 8192. + assert self._resolve(16 * GB) == 8192 + + def test_4gb_container_75_percent(self): + assert self._resolve(4 * GB) == 3072 + + def test_3gb_container_above_floor(self): + assert self._resolve(3 * GB) == 2304 + + def test_2gb_container_at_floor(self): + assert self._resolve(2 * GB) == 1536 + + def test_tiny_container_honors_limit_below_floor(self): + # 1GB -> 75% = 768; honored even though below the 1536 floor, because a + # graceful V8 exit beats a silent cgroup SIGKILL. + assert self._resolve(1 * GB) == 768 + + def test_never_exceeds_default(self): + assert self._resolve(64 * GB) == 8192 + + +class TestNodeOptionsTokenMerge: + """The _launch_tui token-merge block must add the sized cap unless the user + already supplied one, and must preserve unrelated NODE_OPTIONS flags.""" + + def _merge(self, node_options, limit_bytes): + with mock.patch.object(m, "_read_cgroup_memory_limit", return_value=limit_bytes): + tokens = node_options.split() + if not any(t.startswith("--max-old-space-size=") for t in tokens): + tokens.append(f"--max-old-space-size={m._resolve_tui_heap_mb()}") + return " ".join(tokens) + + def test_unconstrained_empty(self): + assert self._merge("", None) == "--max-old-space-size=8192" + + def test_constrained_container(self): + assert self._merge("", 4 * GB) == "--max-old-space-size=3072" + + def test_user_override_respected(self): + assert self._merge("--max-old-space-size=12288", 2 * GB) == "--max-old-space-size=12288" + + def test_preserves_other_flags(self): + assert self._merge("--enable-source-maps", 4 * GB) == "--enable-source-maps --max-old-space-size=3072"