fix(tui): cgroup-aware V8 heap cap so memory-limited containers stop dying silently (#38541)

The TUI hardcoded --max-old-space-size=8192. V8 is not cgroup-aware, so in a Docker/k8s container capped below ~9-10GB the heap grows past the container limit and the cgroup OOM-killer SIGKILLs the Node parent BEFORE V8's own heap monitor fires. SIGKILL runs no JS handler, writes no [tui-parent] breadcrumb, and closes the gateway child's stdin — the user sees only a bare gateway 'stdin EOF'. Complements #38224 (trail-text cap), which reduced pressure but left the 8GB-vs-container mismatch in place. - _read_cgroup_memory_limit(): read cgroup v2 (memory.max) then v1 (memory.limit_in_bytes); handle 'max', the v1 unlimited sentinel, blank/zero, and >=1PB as unconstrained. - _resolve_tui_heap_mb(): unconstrained -> 8192; constrained -> 75% of the cgroup limit (headroom for non-heap RSS + the Python child sharing the cgroup), floored at 1536MB, never above 8192. - NODE_OPTIONS block uses the sized value; still respects a user-supplied --max-old-space-size. Net: V8 now GCs/exits gracefully (onCritical breadcrumb fires) instead of being reaped silently. Display/transport only — no agent context or behavior change. Tests: tests/hermes_cli/test_tui_heap_sizing.py (20 tests).
2026-06-03 16:40:28 -07:00
parent 8a19884bf3
commit 2f523a4691
2 changed files with 204 additions and 3 deletions
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@ -1626,6 +1626,77 @@ def _normalize_tui_toolsets(toolsets: object) -> list[str]:
        return [item for item in normalized if item]


+def _read_cgroup_memory_limit() -> Optional[int]:
+    """Return the container memory limit in bytes, or None if unconstrained.
+
+    Node's V8 heap is NOT cgroup-aware: with a flat ``--max-old-space-size=8192``
+    it happily grows the heap toward 8GB regardless of the container's real
+    memory limit.  In a Docker/k8s container capped below ~9-10GB, the cgroup
+    OOM-killer SIGKILLs Node before V8's own heap monitor ever fires — which
+    runs no JS handler, writes no ``[tui-parent]`` breadcrumb, and the user
+    sees only a bare gateway ``stdin EOF``.  Reading the real cgroup limit lets
+    us size the heap cap below it so V8 GCs/exits gracefully instead of being
+    reaped silently.
+
+    Checks cgroup v2 (``/sys/fs/cgroup/memory.max``) then v1
+    (``/sys/fs/cgroup/memory/memory.limit_in_bytes``).  A literal ``max`` (v2)
+    or the v1 "unlimited" sentinel (a huge near-INT64 value) means no limit.
+    """
+    candidates = (
+        "/sys/fs/cgroup/memory.max",  # cgroup v2
+        "/sys/fs/cgroup/memory/memory.limit_in_bytes",  # cgroup v1
+    )
+    for path in candidates:
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                raw = f.read().strip()
+        except (OSError, ValueError):
+            continue
+        if raw == "max":
+            return None
+        if not raw:
+            # Blank/empty file: no usable value here. Fall through to the next
+            # candidate (don't mistake an empty v2 file for "unlimited").
+            continue
+        try:
+            limit = int(raw)
+        except ValueError:
+            continue
+        if limit <= 0:
+            continue
+        # cgroup v1 reports "unlimited" as a huge value (often
+        # 0x7FFFFFFFFFFFF000 ≈ 9.2 EB, sometimes PAGE_COUNTER_MAX). Anything
+        # at/above ~1 PB is effectively unconstrained — treat as no limit.
+        if limit >= (1 << 50):
+            return None
+        return limit
+    return None
+
+
+def _resolve_tui_heap_mb(default_mb: int = 8192) -> int:
+    """Pick a V8 ``--max-old-space-size`` (MB) that fits the container.
+
+    Returns ``default_mb`` (8192) when unconstrained or when the box is large
+    enough that 8GB fits.  In a memory-limited container, returns ~75% of the
+    cgroup limit so the heap + non-heap RSS stays under the cgroup ceiling,
+    clamped to a sane floor (1536MB — below this V8 GC-thrashes and the TUI
+    is barely usable).  Never exceeds ``default_mb``.
+    """
+    limit = _read_cgroup_memory_limit()
+    if not limit:
+        return default_mb
+    limit_mb = limit // (1024 * 1024)
+    # Leave headroom for non-heap RSS (Node internals, buffers, the Python
+    # gateway child shares the same cgroup): cap the heap at 75% of the limit.
+    sized = int(limit_mb * 0.75)
+    if sized >= default_mb:
+        return default_mb
+    # Floor so a tiny limit doesn't drive V8 into constant GC. If the container
+    # is smaller than the floor, honor the limit-derived value anyway (better a
+    # graceful V8 exit than a silent cgroup kill).
+    return max(1536, sized) if limit_mb > 2048 else sized
+
+
 def _launch_tui(
    resume_session_id: Optional[str] = None,
    tui_dev: bool = False,
@ -1721,16 +1792,23 @@ def _launch_tui(
        env["HERMES_TUI_TOOL_PROGRESS"] = "off"
    if accept_hooks:
        env["HERMES_ACCEPT_HOOKS"] = "1"
-    # Guarantee an 8GB V8 heap for the TUI. Default node cap is ~1.5–4GB
+    # Guarantee a generous V8 heap for the TUI. Default node cap is ~1.5–4GB
    # depending on version and can fatal-OOM on long sessions with large
-    # transcripts / reasoning blobs. Token-level merge: respect any
+    # transcripts / reasoning blobs. We target 8GB on an unconstrained host,
+    # but V8 is NOT cgroup-aware: in a memory-limited Docker/k8s container a
+    # flat 8GB heap grows past the container limit and the cgroup OOM-killer
+    # SIGKILLs Node — running no JS handler, writing no breadcrumb, leaving the
+    # user with only a bare gateway `stdin EOF`. _resolve_tui_heap_mb() reads
+    # the real cgroup limit and sizes the cap below it so V8 GCs/exits
+    # gracefully (and the memory monitor's onCritical breadcrumb can fire)
+    # instead of being reaped silently. Token-level merge: respect any
    # user-supplied --max-old-space-size (they may have set it higher).
    # --expose-gc is *not* added here: Node rejects it in NODE_OPTIONS
    # ("--expose-gc is not allowed in NODE_OPTIONS") and refuses to start.
    # It is passed as a direct argv flag in _make_tui_argv() instead.
    _tokens = env.get("NODE_OPTIONS", "").split()
    if not any(t.startswith("--max-old-space-size=") for t in _tokens):
-        _tokens.append("--max-old-space-size=8192")
+        _tokens.append(f"--max-old-space-size={_resolve_tui_heap_mb()}")
    env["NODE_OPTIONS"] = " ".join(_tokens)
    # HERMES_TUI_RESUME is an internal hand-off from the Python wrapper to the
    # Ink app.  Because we start from os.environ.copy(), an exported/stale value
--- a/tests/hermes_cli/test_tui_heap_sizing.py
+++ b/tests/hermes_cli/test_tui_heap_sizing.py
@ -0,0 +1,123 @@
+"""Tests for cgroup-aware TUI V8 heap sizing.
+
+V8 is not cgroup-aware: a flat ``--max-old-space-size=8192`` lets the heap grow
+toward 8GB in a memory-limited container, so the cgroup OOM-killer SIGKILLs Node
+before V8's own monitor fires — leaving the user with only a bare gateway
+``stdin EOF`` and no breadcrumb. ``_resolve_tui_heap_mb`` reads the real cgroup
+limit and sizes the cap below it so V8 exits gracefully instead.
+"""
+
+import builtins
+import io
+from unittest import mock
+
+import hermes_cli.main as m
+
+V2 = "/sys/fs/cgroup/memory.max"
+V1 = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
+GB = 1024 ** 3
+
+
+def _fake_open(files: dict):
+    """Return an open() shim serving cgroup paths from ``files`` (path->str)."""
+    real_open = builtins.open
+
+    def opener(path, *args, **kwargs):
+        if path in (V2, V1):
+            content = files.get(path)
+            if content is None:
+                raise FileNotFoundError(path)
+            return io.StringIO(content)
+        return real_open(path, *args, **kwargs)
+
+    return opener
+
+
+def _read(files: dict):
+    with mock.patch.object(builtins, "open", _fake_open(files)):
+        return m._read_cgroup_memory_limit()
+
+
+class TestReadCgroupMemoryLimit:
+    def test_v2_max_is_unlimited(self):
+        assert _read({V2: "max"}) is None
+
+    def test_v2_numeric_limit(self):
+        assert _read({V2: str(4 * GB)}) == 4 * GB
+
+    def test_v1_unlimited_sentinel_is_none(self):
+        # cgroup v1 reports "unlimited" as a near-INT64 huge value.
+        assert _read({V1: "9223372036854771712"}) is None
+
+    def test_v1_numeric_limit_when_no_v2(self):
+        assert _read({V1: str(2 * GB)}) == 2 * GB
+
+    def test_no_files_present(self):
+        assert _read({}) is None
+
+    def test_empty_v2_falls_through_to_v1(self):
+        # A blank v2 file must NOT be mistaken for "unlimited" — fall to v1.
+        assert _read({V2: "", V1: str(3 * GB)}) == 3 * GB
+
+    def test_v2_wins_over_v1(self):
+        assert _read({V2: str(6 * GB), V1: str(2 * GB)}) == 6 * GB
+
+    def test_zero_is_skipped(self):
+        assert _read({V2: "0"}) is None
+
+    def test_petabyte_plus_treated_as_unlimited(self):
+        assert _read({V2: str(1 << 51)}) is None
+
+
+class TestResolveTuiHeapMb:
+    def _resolve(self, limit_bytes):
+        with mock.patch.object(m, "_read_cgroup_memory_limit", return_value=limit_bytes):
+            return m._resolve_tui_heap_mb()
+
+    def test_unconstrained_uses_default(self):
+        assert self._resolve(None) == 8192
+
+    def test_large_container_clamps_to_default(self):
+        # 16GB -> 75% = 12288 >= 8192 -> clamp to 8192.
+        assert self._resolve(16 * GB) == 8192
+
+    def test_4gb_container_75_percent(self):
+        assert self._resolve(4 * GB) == 3072
+
+    def test_3gb_container_above_floor(self):
+        assert self._resolve(3 * GB) == 2304
+
+    def test_2gb_container_at_floor(self):
+        assert self._resolve(2 * GB) == 1536
+
+    def test_tiny_container_honors_limit_below_floor(self):
+        # 1GB -> 75% = 768; honored even though below the 1536 floor, because a
+        # graceful V8 exit beats a silent cgroup SIGKILL.
+        assert self._resolve(1 * GB) == 768
+
+    def test_never_exceeds_default(self):
+        assert self._resolve(64 * GB) == 8192
+
+
+class TestNodeOptionsTokenMerge:
+    """The _launch_tui token-merge block must add the sized cap unless the user
+    already supplied one, and must preserve unrelated NODE_OPTIONS flags."""
+
+    def _merge(self, node_options, limit_bytes):
+        with mock.patch.object(m, "_read_cgroup_memory_limit", return_value=limit_bytes):
+            tokens = node_options.split()
+            if not any(t.startswith("--max-old-space-size=") for t in tokens):
+                tokens.append(f"--max-old-space-size={m._resolve_tui_heap_mb()}")
+            return " ".join(tokens)
+
+    def test_unconstrained_empty(self):
+        assert self._merge("", None) == "--max-old-space-size=8192"
+
+    def test_constrained_container(self):
+        assert self._merge("", 4 * GB) == "--max-old-space-size=3072"
+
+    def test_user_override_respected(self):
+        assert self._merge("--max-old-space-size=12288", 2 * GB) == "--max-old-space-size=12288"
+
+    def test_preserves_other_flags(self):
+        assert self._merge("--enable-source-maps", 4 * GB) == "--enable-source-maps --max-old-space-size=3072"