fix(tui): stop persisting full tool output in trail lines (silent OOM death)

A heavy --tui session (browser snapshots, large tool outputs) silently OOM-killed the Node parent within minutes — closing the gateway child's stdin, which the user saw only as a bare "gateway exited" / stdin EOF. CLI was immune. Root cause: each completed tool's verbose trail line embedded up to 16KB of result_text, persisted in transcript Msg.tools[] for the whole session and rendered EXPANDED by default, so an Ink render-node tree was built for every one of up to 800 messages at once. That tree blew past Node's heap at a few hundred MB — far below the 2.5GB memory-monitor exit threshold, so the death was never even attributed. - text.ts: persisted verbose tool-trail blocks now cap to a small preview (VERBOSE_TRAIL_MAX_CHARS=800/12 lines), not the 16KB live-render budget. Retained trail strings drop ~17x (12.2MB -> 0.7MB at 800 msgs); the live streaming tail still uses the larger LIVE_RENDER budget. - tui_gateway/server.py: lower the gateway-side verbose text cap to match (1KB/16 lines) so we stop shipping output the TUI no longer renders. - memoryMonitor.ts: derive critical/high thresholds from the real V8 heap ceiling (~88%/70%) instead of the hardcoded 2.5GB that killed the process at 31% of an 8GB ceiling; add a one-shot onWarn early-warning on fast sub-threshold heap growth so the next such death is diagnosable, not silent. - entry.tsx: wire onWarn to a crash-log breadcrumb + stderr line. Full tool output is unchanged in the agent context and SQLite session — this is display/transport only, no behavior or context change. Fixes #34095. Related #27282. Tests: ui-tui text + new memoryMonitor suites (33 pass), python verbose-cap guard (5 pass); full ui-tui suite shows no new failures vs pristine main. E2E repro confirms the retention drop.
2026-06-03 06:00:22 -07:00
parent ba57ebec33
commit e76d8bf5aa
8 changed files with 251 additions and 9 deletions
--- a/tests/test_tui_gateway_server.py
+++ b/tests/test_tui_gateway_server.py
@ -133,6 +133,20 @@ def test_tui_verbose_tool_details_are_capped_before_emit(monkeypatch):
    assert "one" not in capped


+def test_tui_verbose_default_cap_stays_small(monkeypatch):
+    # Regression guard for #34095: the verbose tool text shipped to the TUI is
+    # rendered into a persisted, expanded-by-default trail block for the whole
+    # session. Raising this cap back toward the old 16KB re-introduces the Ink
+    # render-tree blowup that silently OOM-killed the TUI. Keep it small.
+    assert server._TUI_VERBOSE_TEXT_MAX_CHARS <= 2_000
+
+    huge = "x" * 40_000
+    capped = server._cap_tui_verbose_text(huge)
+
+    assert len(capped) < 2_000
+    assert capped.startswith("[showing verbose tail; omitted ")
+
+
 def test_tui_verbose_tool_events_omit_details_when_redaction_fails(monkeypatch):
    redact_module = types.ModuleType("agent.redact")

--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@ -1657,8 +1657,15 @@ def _tool_ctx(name: str, args: dict) -> str:
        return ""


-_TUI_VERBOSE_TEXT_MAX_CHARS = 16_000
-_TUI_VERBOSE_TEXT_MAX_LINES = 240
+# Tool Args/Result text shipped to the TUI for the verbose trail line. The TUI
+# renders only a small persisted preview (ui-tui VERBOSE_TRAIL_MAX_CHARS), kept
+# all session and expanded by default — so shipping more than that is pure pipe
+# waste AND feeds the Ink render-tree blowup that silently OOM-killed the TUI
+# parent (#34095). Cap here to match the render budget (a hair more, so the
+# "[omitted …]" label is still informative when output is genuinely large).
+# Full output stays in the agent context and the SQLite session, untouched.
+_TUI_VERBOSE_TEXT_MAX_CHARS = 1_000
+_TUI_VERBOSE_TEXT_MAX_LINES = 16


 def _cap_tui_verbose_text(text: str) -> str:
--- a/ui-tui/src/tests/memoryMonitor.test.ts
+++ b/ui-tui/src/tests/memoryMonitor.test.ts
@ -0,0 +1,102 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+
+// memory.js performs real heap dumps / fs work — stub it so the monitor's
+// dump path is a no-op in tests.
+vi.mock('../lib/memory.js', () => ({
+  performHeapDump: vi.fn(async () => null)
+}))
+
+// @hermes/ink is dynamically imported only on the dump path; stub the eviction.
+vi.mock('@hermes/ink', () => ({ evictInkCaches: vi.fn() }))
+
+import { startMemoryMonitor } from '../lib/memoryMonitor.js'
+
+const GB = 1024 ** 3
+const MB = 1024 ** 2
+
+describe('startMemoryMonitor thresholds (#34095)', () => {
+  let stop: (() => void) | undefined
+
+  beforeEach(() => {
+    vi.useFakeTimers()
+  })
+
+  afterEach(() => {
+    stop?.()
+    stop = undefined
+    vi.restoreAllMocks()
+    vi.useRealTimers()
+  })
+
+  const withHeap = (heapUsed: number, rss = heapUsed) =>
+    vi.spyOn(process, 'memoryUsage').mockReturnValue({
+      arrayBuffers: 0,
+      external: 0,
+      heapTotal: heapUsed,
+      heapUsed,
+      rss
+    } as NodeJS.MemoryUsage)
+
+  it('does NOT fire onCritical at 2.5GB when the heap ceiling is 8GB', async () => {
+    // The old hardcoded 2.5GB constant killed the process at ~31% of the real
+    // ceiling. With relative thresholds (~88%), 2.5GB is well within normal.
+    const onCritical = vi.fn()
+    withHeap(2.5 * GB)
+    stop = startMemoryMonitor({ intervalMs: 1, onCritical })
+
+    await vi.advanceTimersByTimeAsync(5)
+
+    expect(onCritical).not.toHaveBeenCalled()
+  })
+
+  it('fires onCritical only near the configured ceiling', async () => {
+    const onCritical = vi.fn()
+    // Explicit small ceiling-derived thresholds via override to keep the test
+    // independent of the host V8 heap_size_limit.
+    withHeap(7.5 * GB)
+    stop = startMemoryMonitor({ criticalBytes: 7 * GB, highBytes: 5 * GB, intervalMs: 1, onCritical })
+
+    await vi.advanceTimersByTimeAsync(5)
+
+    expect(onCritical).toHaveBeenCalledTimes(1)
+  })
+
+  it('fires onWarn once on fast sub-threshold heap growth, then re-arms', async () => {
+    const onWarn = vi.fn()
+    // Start low, then jump >150MB across a tick while above the 600MB floor and
+    // below `high` — the silent-death regime.
+    const spy = withHeap(100 * MB)
+    stop = startMemoryMonitor({ highBytes: 2 * GB, intervalMs: 1, onWarn, warnBytes: 600 * MB })
+
+    await vi.advanceTimersByTimeAsync(2) // seed lastHeap at 100MB, below floor
+    expect(onWarn).not.toHaveBeenCalled()
+
+    spy.mockReturnValue({ arrayBuffers: 0, external: 0, heapTotal: 800 * MB, heapUsed: 800 * MB, rss: 800 * MB } as NodeJS.MemoryUsage)
+    await vi.advanceTimersByTimeAsync(2) // jumped 700MB → above floor + steep
+    expect(onWarn).toHaveBeenCalledTimes(1)
+
+    // Stays elevated but not re-firing.
+    await vi.advanceTimersByTimeAsync(2)
+    expect(onWarn).toHaveBeenCalledTimes(1)
+
+    // Falls back below the floor → re-armed, then climbs again → fires again.
+    spy.mockReturnValue({ arrayBuffers: 0, external: 0, heapTotal: 100 * MB, heapUsed: 100 * MB, rss: 100 * MB } as NodeJS.MemoryUsage)
+    await vi.advanceTimersByTimeAsync(2)
+    spy.mockReturnValue({ arrayBuffers: 0, external: 0, heapTotal: 800 * MB, heapUsed: 800 * MB, rss: 800 * MB } as NodeJS.MemoryUsage)
+    await vi.advanceTimersByTimeAsync(2)
+    expect(onWarn).toHaveBeenCalledTimes(2)
+  })
+
+  it('does not warn on slow growth below the steep-growth step', async () => {
+    const onWarn = vi.fn()
+    const spy = withHeap(650 * MB)
+    stop = startMemoryMonitor({ highBytes: 2 * GB, intervalMs: 1, onWarn, warnBytes: 600 * MB })
+
+    await vi.advanceTimersByTimeAsync(2)
+    // +50MB per tick — above the floor but gentle, not a render-tree blowup.
+    spy.mockReturnValue({ arrayBuffers: 0, external: 0, heapTotal: 700 * MB, heapUsed: 700 * MB, rss: 700 * MB } as NodeJS.MemoryUsage)
+    await vi.advanceTimersByTimeAsync(2)
+
+    expect(onWarn).not.toHaveBeenCalled()
+  })
+})
--- a/ui-tui/src/tests/text.test.ts
+++ b/ui-tui/src/tests/text.test.ts
@ -69,6 +69,29 @@ describe('buildVerboseToolTrailLine', () => {
      mark: '✗'
    })
  })
+
+  it('caps a large result to a small persisted preview (#34095)', () => {
+    // A 40KB browser-snapshot-sized result must NOT be embedded whole — the
+    // persisted, expanded-by-default trail block is what blew up the Ink
+    // render tree and silently OOM-killed the TUI. The block stays small.
+    const huge = 'A'.repeat(40_000)
+    const line = buildVerboseToolTrailLine('browser_snapshot', 'https://x.example', false, 2, undefined, huge)
+
+    expect(line).toContain('Result:\n')
+    // Far below the old 16KB live-render budget; the whole line (call + label +
+    // omitted marker + preview) must stay on the order of ~1KB, not ~40KB.
+    expect(line.length).toBeLessThan(2_000)
+    expect(line).toContain('omitted')
+    expect(line.endsWith(' ✓')).toBe(true)
+  })
+
+  it('does not truncate a result that already fits the preview budget', () => {
+    const small = 'ok: 3 files changed'
+    const line = buildVerboseToolTrailLine('patch', 'index.html', false, 0.1, undefined, small)
+
+    expect(line).toContain(`Result:\n${small}`)
+    expect(line).not.toContain('omitted')
+  })
 })

 describe('lastCotTrailIndex', () => {
--- a/ui-tui/src/config/limits.ts
+++ b/ui-tui/src/config/limits.ts
@ -3,6 +3,19 @@ export const LARGE_PASTE = { lines: 5 }
 export const LIVE_RENDER_MAX_CHARS = 16_000
 export const LIVE_RENDER_MAX_LINES = 240

+// Persisted verbose tool-trail blocks (Args/Result embedded in a completed
+// tool line) are kept for the WHOLE session in transcript Msg.tools[] and
+// rendered expanded by default, so a render-node tree is built for every one
+// of up to MAX_HISTORY messages at once. Capping these to the live-render
+// budget (16KB) let a heavy browser/large-output session retain ~12MB of
+// strings that exploded into a few hundred MB of Ink nodes and silently OOM-
+// killed the Node parent (→ stdin EOF, gateway death; issue #34095). The live
+// streaming tail still uses the larger LIVE_RENDER budget — only the persisted
+// per-call block shrinks to a readable preview here. Full output remains in the
+// agent context and the SQLite session; the trail is a glance, not a log.
+export const VERBOSE_TRAIL_MAX_CHARS = 800
+export const VERBOSE_TRAIL_MAX_LINES = 12
+
 export const LONG_MSG = 300
 export const MAX_HISTORY = 800
 export const THINKING_COT_MAX = 160
--- a/ui-tui/src/entry.tsx
+++ b/ui-tui/src/entry.tsx
@ -75,7 +75,17 @@ const stopMemoryMonitor = startMemoryMonitor({
    process.stderr.write('hermes-tui: exiting to avoid OOM; restart to recover\n')
    process.exit(137)
  },
-  onHigh: (snap, dump) => process.stderr.write(dumpNotice(snap, dump))
+  onHigh: (snap, dump) => process.stderr.write(dumpNotice(snap, dump)),
+  // Sub-threshold abnormal heap growth (#34095). The TUI used to die silently
+  // here — Node OOMs from a render-tree blowup well below the exit threshold,
+  // so the only trace was a bare gateway `stdin EOF`. Persist a breadcrumb +
+  // stderr line so the next such death is attributable instead of silent.
+  onWarn: snap => {
+    recordParentLifecycle(`memory-warning fast heap growth heap=${formatBytes(snap.heapUsed)} rss=${formatBytes(snap.rss)}`)
+    process.stderr.write(
+      `hermes-tui: heap climbing fast (${formatBytes(snap.heapUsed)}) — a large tool output or long session may be straining memory\n`
+    )
+  }
 })

 if (process.env.HERMES_HEAPDUMP_ON_START === '1') {
--- a/ui-tui/src/lib/memoryMonitor.ts
+++ b/ui-tui/src/lib/memoryMonitor.ts
@ -1,3 +1,5 @@
+import { getHeapStatistics } from 'node:v8'
+
 import { type HeapDumpResult, performHeapDump } from './memory.js'

 export type MemoryLevel = 'critical' | 'high' | 'normal'
@ -14,9 +16,41 @@ export interface MemoryMonitorOptions {
  intervalMs?: number
  onCritical?: (snap: MemorySnapshot, dump: HeapDumpResult | null) => void
  onHigh?: (snap: MemorySnapshot, dump: HeapDumpResult | null) => void
+  // Fired ONCE when heap growth looks abnormal while still far below the
+  // critical exit threshold — the regime where the TUI used to die silently
+  // (#34095: Node OOMs from an Ink render-tree blowup at a few hundred MB,
+  // well under criticalBytes, so onCritical never fired and the gateway death
+  // showed up only as a bare `stdin EOF`). A visible warning here makes that
+  // class of death diagnosable instead of silent.
+  onWarn?: (snap: MemorySnapshot) => void
+  warnBytes?: number
 }

 const GB = 1024 ** 3
+const MB = 1024 ** 2
+
+// Resolve the exit / dump thresholds RELATIVE to the actual V8 heap ceiling
+// (--max-old-space-size, 8GB for the TUI) instead of hardcoding 2.5GB. The old
+// constant killed the process — and silently closed the gateway's stdin — at
+// ~31% of an 8GB ceiling, treating a normal long-session heap as an OOM. We now
+// exit only when genuinely near the ceiling (critical ~88%, high ~70%), and
+// clamp to sane floors/ceilings so a tiny --max-old-space-size can't drive the
+// thresholds below the warn watermark. Callers may still override explicitly.
+function resolveThresholds(criticalBytes?: number, highBytes?: number) {
+  let limit = 0
+  try {
+    limit = getHeapStatistics().heap_size_limit || 0
+  } catch {
+    limit = 0
+  }
+
+  // Fall back to the historical 8GB ceiling if V8 doesn't report one.
+  const ceiling = limit > 0 ? limit : 8 * GB
+  const critical = criticalBytes ?? Math.max(2 * GB, Math.round(ceiling * 0.88))
+  const high = highBytes ?? Math.max(1 * GB, Math.min(critical - 256 * MB, Math.round(ceiling * 0.7)))
+
+  return { critical, high }
+}

 // Deferred @hermes/ink import: loading `@hermes/ink` at module top-level
 // pulls the full ~414KB Ink bundle (React, renderer, components, hooks) onto
@ -53,18 +87,46 @@ async function _ensureEvictInkCaches(): Promise<(level: 'all' | 'half') => unkno
 }

 export function startMemoryMonitor({
-  criticalBytes = 2.5 * GB,
-  highBytes = 1.5 * GB,
+  criticalBytes,
+  highBytes,
  intervalMs = 10_000,
  onCritical,
-  onHigh
+  onHigh,
+  onWarn,
+  warnBytes = 600 * MB
 }: MemoryMonitorOptions = {}): () => void {
+  const { critical, high } = resolveThresholds(criticalBytes, highBytes)
  const dumped = new Set<Exclude<MemoryLevel, 'normal'>>()
  const inFlight = new Set<Exclude<MemoryLevel, 'normal'>>()

+  // Early-warning state (#34095): the silent-death regime is BELOW `high`, so
+  // the level machine above never sees it. Track the previous sample and fire
+  // onWarn at most once when heap both crosses a modest absolute floor AND is
+  // climbing steeply (≥150MB between 10s ticks) — the signature of a render-
+  // tree blowup — so the user gets a visible heads-up before Node OOMs under
+  // the exit threshold. Re-armed only after heap falls back below the floor.
+  // `lastHeap < 0` marks the un-seeded first sample so a cold start that opens
+  // already-high can't be mistaken for sudden growth (growth = current - last).
+  let lastHeap = -1
+  let warned = false
+  const WARN_GROWTH_STEP = 150 * MB
+
  const tick = async () => {
    const { heapUsed, rss } = process.memoryUsage()
-    const level: MemoryLevel = heapUsed >= criticalBytes ? 'critical' : heapUsed >= highBytes ? 'high' : 'normal'
+
+    // Sub-threshold abnormal-growth warning. Skip on the first (un-seeded)
+    // sample — we need a prior reading to measure a delta against.
+    if (heapUsed < high && lastHeap >= 0) {
+      if (!warned && heapUsed >= warnBytes && heapUsed - lastHeap >= WARN_GROWTH_STEP) {
+        warned = true
+        onWarn?.({ heapUsed, level: 'normal', rss })
+      } else if (heapUsed < warnBytes) {
+        warned = false
+      }
+    }
+    lastHeap = heapUsed
+
+    const level: MemoryLevel = heapUsed >= critical ? 'critical' : heapUsed >= high ? 'high' : 'normal'

    if (level === 'normal') {
      dumped.clear()
--- a/ui-tui/src/lib/text.ts
+++ b/ui-tui/src/lib/text.ts
@ -1,7 +1,9 @@
 import {
  LIVE_RENDER_MAX_CHARS,
  LIVE_RENDER_MAX_LINES,
-  THINKING_COT_MAX
+  THINKING_COT_MAX,
+  VERBOSE_TRAIL_MAX_CHARS,
+  VERBOSE_TRAIL_MAX_LINES
 } from '../config/limits.js'
 import { VERBS } from '../content/verbs.js'
 import type { ThinkingMode } from '../types.js'
@ -215,7 +217,16 @@ export const buildToolTrailLine = (
 const verboseToolBlock = (label: string, text?: string) => {
  const body = (text ?? '').trim()

-  return body ? `${label}:\n${boundedLiveRenderText(body)}` : ''
+  // Persisted trail blocks are kept all session and rendered expanded by
+  // default — cap to a small readable preview (NOT the 16KB live-render
+  // budget) so a large tool output can't balloon the Ink render tree and
+  // silently OOM-kill the TUI. See VERBOSE_TRAIL_MAX_CHARS (#34095).
+  return body
+    ? `${label}:\n${boundedLiveRenderText(body, {
+        maxChars: VERBOSE_TRAIL_MAX_CHARS,
+        maxLines: VERBOSE_TRAIL_MAX_LINES
+      })}`
+    : ''
 }

 export const buildVerboseToolTrailLine = (