fix(tui): cgroup-aware V8 heap cap so memory-limited containers stop dying silently (#38541)
The TUI hardcoded --max-old-space-size=8192. V8 is not cgroup-aware, so in a Docker/k8s container capped below ~9-10GB the heap grows past the container limit and the cgroup OOM-killer SIGKILLs the Node parent BEFORE V8's own heap monitor fires. SIGKILL runs no JS handler, writes no [tui-parent] breadcrumb, and closes the gateway child's stdin — the user sees only a bare gateway 'stdin EOF'. Complements #38224 (trail-text cap), which reduced pressure but left the 8GB-vs-container mismatch in place. - _read_cgroup_memory_limit(): read cgroup v2 (memory.max) then v1 (memory.limit_in_bytes); handle 'max', the v1 unlimited sentinel, blank/zero, and >=1PB as unconstrained. - _resolve_tui_heap_mb(): unconstrained -> 8192; constrained -> 75% of the cgroup limit (headroom for non-heap RSS + the Python child sharing the cgroup), floored at 1536MB, never above 8192. - NODE_OPTIONS block uses the sized value; still respects a user-supplied --max-old-space-size. Net: V8 now GCs/exits gracefully (onCritical breadcrumb fires) instead of being reaped silently. Display/transport only — no agent context or behavior change. Tests: tests/hermes_cli/test_tui_heap_sizing.py (20 tests).
This commit is contained in:
@ -1626,6 +1626,77 @@ def _normalize_tui_toolsets(toolsets: object) -> list[str]:
|
||||
return [item for item in normalized if item]
|
||||
|
||||
|
||||
def _read_cgroup_memory_limit() -> Optional[int]:
|
||||
"""Return the container memory limit in bytes, or None if unconstrained.
|
||||
|
||||
Node's V8 heap is NOT cgroup-aware: with a flat ``--max-old-space-size=8192``
|
||||
it happily grows the heap toward 8GB regardless of the container's real
|
||||
memory limit. In a Docker/k8s container capped below ~9-10GB, the cgroup
|
||||
OOM-killer SIGKILLs Node before V8's own heap monitor ever fires — which
|
||||
runs no JS handler, writes no ``[tui-parent]`` breadcrumb, and the user
|
||||
sees only a bare gateway ``stdin EOF``. Reading the real cgroup limit lets
|
||||
us size the heap cap below it so V8 GCs/exits gracefully instead of being
|
||||
reaped silently.
|
||||
|
||||
Checks cgroup v2 (``/sys/fs/cgroup/memory.max``) then v1
|
||||
(``/sys/fs/cgroup/memory/memory.limit_in_bytes``). A literal ``max`` (v2)
|
||||
or the v1 "unlimited" sentinel (a huge near-INT64 value) means no limit.
|
||||
"""
|
||||
candidates = (
|
||||
"/sys/fs/cgroup/memory.max", # cgroup v2
|
||||
"/sys/fs/cgroup/memory/memory.limit_in_bytes", # cgroup v1
|
||||
)
|
||||
for path in candidates:
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
raw = f.read().strip()
|
||||
except (OSError, ValueError):
|
||||
continue
|
||||
if raw == "max":
|
||||
return None
|
||||
if not raw:
|
||||
# Blank/empty file: no usable value here. Fall through to the next
|
||||
# candidate (don't mistake an empty v2 file for "unlimited").
|
||||
continue
|
||||
try:
|
||||
limit = int(raw)
|
||||
except ValueError:
|
||||
continue
|
||||
if limit <= 0:
|
||||
continue
|
||||
# cgroup v1 reports "unlimited" as a huge value (often
|
||||
# 0x7FFFFFFFFFFFF000 ≈ 9.2 EB, sometimes PAGE_COUNTER_MAX). Anything
|
||||
# at/above ~1 PB is effectively unconstrained — treat as no limit.
|
||||
if limit >= (1 << 50):
|
||||
return None
|
||||
return limit
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_tui_heap_mb(default_mb: int = 8192) -> int:
|
||||
"""Pick a V8 ``--max-old-space-size`` (MB) that fits the container.
|
||||
|
||||
Returns ``default_mb`` (8192) when unconstrained or when the box is large
|
||||
enough that 8GB fits. In a memory-limited container, returns ~75% of the
|
||||
cgroup limit so the heap + non-heap RSS stays under the cgroup ceiling,
|
||||
clamped to a sane floor (1536MB — below this V8 GC-thrashes and the TUI
|
||||
is barely usable). Never exceeds ``default_mb``.
|
||||
"""
|
||||
limit = _read_cgroup_memory_limit()
|
||||
if not limit:
|
||||
return default_mb
|
||||
limit_mb = limit // (1024 * 1024)
|
||||
# Leave headroom for non-heap RSS (Node internals, buffers, the Python
|
||||
# gateway child shares the same cgroup): cap the heap at 75% of the limit.
|
||||
sized = int(limit_mb * 0.75)
|
||||
if sized >= default_mb:
|
||||
return default_mb
|
||||
# Floor so a tiny limit doesn't drive V8 into constant GC. If the container
|
||||
# is smaller than the floor, honor the limit-derived value anyway (better a
|
||||
# graceful V8 exit than a silent cgroup kill).
|
||||
return max(1536, sized) if limit_mb > 2048 else sized
|
||||
|
||||
|
||||
def _launch_tui(
|
||||
resume_session_id: Optional[str] = None,
|
||||
tui_dev: bool = False,
|
||||
@ -1721,16 +1792,23 @@ def _launch_tui(
|
||||
env["HERMES_TUI_TOOL_PROGRESS"] = "off"
|
||||
if accept_hooks:
|
||||
env["HERMES_ACCEPT_HOOKS"] = "1"
|
||||
# Guarantee an 8GB V8 heap for the TUI. Default node cap is ~1.5–4GB
|
||||
# Guarantee a generous V8 heap for the TUI. Default node cap is ~1.5–4GB
|
||||
# depending on version and can fatal-OOM on long sessions with large
|
||||
# transcripts / reasoning blobs. Token-level merge: respect any
|
||||
# transcripts / reasoning blobs. We target 8GB on an unconstrained host,
|
||||
# but V8 is NOT cgroup-aware: in a memory-limited Docker/k8s container a
|
||||
# flat 8GB heap grows past the container limit and the cgroup OOM-killer
|
||||
# SIGKILLs Node — running no JS handler, writing no breadcrumb, leaving the
|
||||
# user with only a bare gateway `stdin EOF`. _resolve_tui_heap_mb() reads
|
||||
# the real cgroup limit and sizes the cap below it so V8 GCs/exits
|
||||
# gracefully (and the memory monitor's onCritical breadcrumb can fire)
|
||||
# instead of being reaped silently. Token-level merge: respect any
|
||||
# user-supplied --max-old-space-size (they may have set it higher).
|
||||
# --expose-gc is *not* added here: Node rejects it in NODE_OPTIONS
|
||||
# ("--expose-gc is not allowed in NODE_OPTIONS") and refuses to start.
|
||||
# It is passed as a direct argv flag in _make_tui_argv() instead.
|
||||
_tokens = env.get("NODE_OPTIONS", "").split()
|
||||
if not any(t.startswith("--max-old-space-size=") for t in _tokens):
|
||||
_tokens.append("--max-old-space-size=8192")
|
||||
_tokens.append(f"--max-old-space-size={_resolve_tui_heap_mb()}")
|
||||
env["NODE_OPTIONS"] = " ".join(_tokens)
|
||||
# HERMES_TUI_RESUME is an internal hand-off from the Python wrapper to the
|
||||
# Ink app. Because we start from os.environ.copy(), an exported/stale value
|
||||
|
||||
123
tests/hermes_cli/test_tui_heap_sizing.py
Normal file
123
tests/hermes_cli/test_tui_heap_sizing.py
Normal file
@ -0,0 +1,123 @@
|
||||
"""Tests for cgroup-aware TUI V8 heap sizing.
|
||||
|
||||
V8 is not cgroup-aware: a flat ``--max-old-space-size=8192`` lets the heap grow
|
||||
toward 8GB in a memory-limited container, so the cgroup OOM-killer SIGKILLs Node
|
||||
before V8's own monitor fires — leaving the user with only a bare gateway
|
||||
``stdin EOF`` and no breadcrumb. ``_resolve_tui_heap_mb`` reads the real cgroup
|
||||
limit and sizes the cap below it so V8 exits gracefully instead.
|
||||
"""
|
||||
|
||||
import builtins
|
||||
import io
|
||||
from unittest import mock
|
||||
|
||||
import hermes_cli.main as m
|
||||
|
||||
V2 = "/sys/fs/cgroup/memory.max"
|
||||
V1 = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
|
||||
GB = 1024 ** 3
|
||||
|
||||
|
||||
def _fake_open(files: dict):
|
||||
"""Return an open() shim serving cgroup paths from ``files`` (path->str)."""
|
||||
real_open = builtins.open
|
||||
|
||||
def opener(path, *args, **kwargs):
|
||||
if path in (V2, V1):
|
||||
content = files.get(path)
|
||||
if content is None:
|
||||
raise FileNotFoundError(path)
|
||||
return io.StringIO(content)
|
||||
return real_open(path, *args, **kwargs)
|
||||
|
||||
return opener
|
||||
|
||||
|
||||
def _read(files: dict):
|
||||
with mock.patch.object(builtins, "open", _fake_open(files)):
|
||||
return m._read_cgroup_memory_limit()
|
||||
|
||||
|
||||
class TestReadCgroupMemoryLimit:
|
||||
def test_v2_max_is_unlimited(self):
|
||||
assert _read({V2: "max"}) is None
|
||||
|
||||
def test_v2_numeric_limit(self):
|
||||
assert _read({V2: str(4 * GB)}) == 4 * GB
|
||||
|
||||
def test_v1_unlimited_sentinel_is_none(self):
|
||||
# cgroup v1 reports "unlimited" as a near-INT64 huge value.
|
||||
assert _read({V1: "9223372036854771712"}) is None
|
||||
|
||||
def test_v1_numeric_limit_when_no_v2(self):
|
||||
assert _read({V1: str(2 * GB)}) == 2 * GB
|
||||
|
||||
def test_no_files_present(self):
|
||||
assert _read({}) is None
|
||||
|
||||
def test_empty_v2_falls_through_to_v1(self):
|
||||
# A blank v2 file must NOT be mistaken for "unlimited" — fall to v1.
|
||||
assert _read({V2: "", V1: str(3 * GB)}) == 3 * GB
|
||||
|
||||
def test_v2_wins_over_v1(self):
|
||||
assert _read({V2: str(6 * GB), V1: str(2 * GB)}) == 6 * GB
|
||||
|
||||
def test_zero_is_skipped(self):
|
||||
assert _read({V2: "0"}) is None
|
||||
|
||||
def test_petabyte_plus_treated_as_unlimited(self):
|
||||
assert _read({V2: str(1 << 51)}) is None
|
||||
|
||||
|
||||
class TestResolveTuiHeapMb:
|
||||
def _resolve(self, limit_bytes):
|
||||
with mock.patch.object(m, "_read_cgroup_memory_limit", return_value=limit_bytes):
|
||||
return m._resolve_tui_heap_mb()
|
||||
|
||||
def test_unconstrained_uses_default(self):
|
||||
assert self._resolve(None) == 8192
|
||||
|
||||
def test_large_container_clamps_to_default(self):
|
||||
# 16GB -> 75% = 12288 >= 8192 -> clamp to 8192.
|
||||
assert self._resolve(16 * GB) == 8192
|
||||
|
||||
def test_4gb_container_75_percent(self):
|
||||
assert self._resolve(4 * GB) == 3072
|
||||
|
||||
def test_3gb_container_above_floor(self):
|
||||
assert self._resolve(3 * GB) == 2304
|
||||
|
||||
def test_2gb_container_at_floor(self):
|
||||
assert self._resolve(2 * GB) == 1536
|
||||
|
||||
def test_tiny_container_honors_limit_below_floor(self):
|
||||
# 1GB -> 75% = 768; honored even though below the 1536 floor, because a
|
||||
# graceful V8 exit beats a silent cgroup SIGKILL.
|
||||
assert self._resolve(1 * GB) == 768
|
||||
|
||||
def test_never_exceeds_default(self):
|
||||
assert self._resolve(64 * GB) == 8192
|
||||
|
||||
|
||||
class TestNodeOptionsTokenMerge:
|
||||
"""The _launch_tui token-merge block must add the sized cap unless the user
|
||||
already supplied one, and must preserve unrelated NODE_OPTIONS flags."""
|
||||
|
||||
def _merge(self, node_options, limit_bytes):
|
||||
with mock.patch.object(m, "_read_cgroup_memory_limit", return_value=limit_bytes):
|
||||
tokens = node_options.split()
|
||||
if not any(t.startswith("--max-old-space-size=") for t in tokens):
|
||||
tokens.append(f"--max-old-space-size={m._resolve_tui_heap_mb()}")
|
||||
return " ".join(tokens)
|
||||
|
||||
def test_unconstrained_empty(self):
|
||||
assert self._merge("", None) == "--max-old-space-size=8192"
|
||||
|
||||
def test_constrained_container(self):
|
||||
assert self._merge("", 4 * GB) == "--max-old-space-size=3072"
|
||||
|
||||
def test_user_override_respected(self):
|
||||
assert self._merge("--max-old-space-size=12288", 2 * GB) == "--max-old-space-size=12288"
|
||||
|
||||
def test_preserves_other_flags(self):
|
||||
assert self._merge("--enable-source-maps", 4 * GB) == "--enable-source-maps --max-old-space-size=3072"
|
||||
Reference in New Issue
Block a user