feat(dashboard): add Debug Share to the System page (#38600)
* Port from google-gemini/gemini-cli#21541: back up corrupted config.yaml When config.yaml fails to parse, load_config() silently falls back to DEFAULT_CONFIG and leaves the broken file on disk. If the user then re-runs the setup wizard or hermes config set (both rewrite config.yaml), their broken-but-recoverable overrides are lost for good. Adapts the policy-file recovery from gemini-cli#21541: on the first parse warning for a given broken file, snapshot it to config.yaml.corrupt.<ts>.bak (best-effort, symlink-guarded, size-deduped) and tell the user where it landed. Unlike Gemini's version we deliberately do NOT reset config.yaml to a clean state — hermes never silently mutates user config, and leaving it means a hand-fixed file is re-read on the next load. Tests: 3 new cases (backup created + content preserved + original untouched; same-size backup dedup; symlink not copied). E2E verified with isolated HERMES_HOME and a real tab-indented broken config. * feat(dashboard): add Debug Share to the System page Surface `hermes debug share` in the dashboard. The System > Operations section gets a dedicated card that uploads a redacted report + full logs and returns the paste URLs as real, copyable links instead of a log tail. - debug.py: factor a pure build_debug_share() returning structured {urls, failures, redacted, auto_delete_seconds}; run_debug_share now calls it (CLI output unchanged). - web_server.py: POST /api/ops/debug-share runs the share core in a worker thread and returns the structured payload synchronously (the URLs are the whole point — not a backgrounded action). - api.ts: runDebugShare() + DebugShareResponse. - SystemPage.tsx: share card with a redaction toggle (on by default), per-link + copy-all buttons, and the 6h auto-delete countdown. - tests: build_debug_share core + endpoint (redact toggle, failure 502, token gate).
This commit is contained in:
@ -17,11 +17,13 @@ import logging
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import shutil
|
||||
import stat
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
@ -36,6 +38,60 @@ logger = logging.getLogger(__name__)
|
||||
_CONFIG_PARSE_WARNED: set = set()
|
||||
|
||||
|
||||
def _backup_corrupt_config(config_path: Path) -> Optional[Path]:
    """Preserve a corrupted ``config.yaml`` by copying it to a timestamped ``.bak``.

    When the YAML can't be parsed, ``load_config()`` silently falls back to
    ``DEFAULT_CONFIG`` and the user's broken file stays on disk untouched.
    That file is still the user's only copy of their intended overrides — if
    they re-run the setup wizard or ``hermes config set`` (which rewrites
    ``config.yaml``), the broken-but-recoverable content is gone for good.

    This snapshots the corrupted file to ``config.yaml.corrupt.<ts>.bak`` so
    the user can diff/repair it. Unlike Gemini CLI's policy-file recovery
    (which resets the live file to a clean state), we deliberately leave
    ``config.yaml`` in place: hermes never silently mutates the user's config,
    and leaving it means a hand-fixed file is re-read on the next load.

    Args:
        config_path: Path of the config file that failed to parse.

    Returns:
        The backup path on success, else ``None``. ``None`` is returned when
        the path is a symlink, the file is empty, an identical backup already
        exists, a backup with the same timestamped name already exists, or
        any error occurs.

    The backup is best-effort: every failure (permissions, disk full, ...) is
    swallowed so config loading is never blocked by backup problems. Symlinks
    are not followed/copied (mirrors the Gemini #21541 lstat guard) to avoid
    clobbering whatever a malicious/misconfigured symlink points at.
    """
    try:
        if config_path.is_symlink():
            return None
        st = config_path.stat()
        if st.st_size == 0:
            # Empty file isn't worth preserving and yaml.safe_load returns {}
            # for it anyway (so it wouldn't reach here), but guard regardless.
            return None

        ts = time.strftime("%Y%m%d-%H%M%S")
        backup_path = config_path.with_name(f"{config_path.name}.corrupt.{ts}.bak")

        # Skip if this exact corruption was already snapshotted (the in-process
        # dedup cache normally prevents a second call, but a process restart
        # clears it). Size is only a cheap pre-filter: a *different* broken
        # file can easily have the same length, so equality is confirmed by
        # content before deciding there is nothing new to preserve.
        current_bytes: Optional[bytes] = None
        for existing in config_path.parent.glob(f"{config_path.name}.corrupt.*.bak"):
            try:
                if existing.stat().st_size != st.st_size:
                    continue
                if current_bytes is None:
                    current_bytes = config_path.read_bytes()
                if existing.read_bytes() == current_bytes:
                    return None
            except OSError:
                # Unreadable sibling backup — ignore it and keep looking.
                continue

        # Never clobber a backup written during the same second.
        if backup_path.exists():
            return None

        shutil.copy2(config_path, backup_path)
        return backup_path
    except Exception:
        # Deliberate best-effort: a failed backup must not break config loading.
        return None
|
||||
|
||||
|
||||
def _warn_config_parse_failure(config_path: Path, exc: Exception) -> None:
|
||||
"""Surface a config.yaml parse failure to user, log, and stderr.
|
||||
|
||||
@ -48,7 +104,11 @@ def _warn_config_parse_failure(config_path: Path, exc: Exception) -> None:
|
||||
Now: warn once per (path, mtime_ns, size) on stderr **and** in
|
||||
``agent.log`` / ``errors.log`` at WARNING level so ``hermes logs``
|
||||
surfaces it. Re-warns automatically if the file changes (different
|
||||
mtime/size), so users editing the config see the next failure.
|
||||
mtime/size), so users editing the config see the next failure. On the
|
||||
first warning for a given broken file we also snapshot it to a
|
||||
timestamped ``.bak`` (best-effort) so the user's recoverable content
|
||||
survives any later rewrite of ``config.yaml`` by the setup wizard or
|
||||
``hermes config set``.
|
||||
"""
|
||||
try:
|
||||
st = config_path.stat()
|
||||
@ -59,12 +119,16 @@ def _warn_config_parse_failure(config_path: Path, exc: Exception) -> None:
|
||||
return
|
||||
_CONFIG_PARSE_WARNED.add(key)
|
||||
|
||||
backup_path = _backup_corrupt_config(config_path)
|
||||
|
||||
msg = (
|
||||
f"Failed to parse {config_path}: {exc}. "
|
||||
f"Falling back to default config — every user override "
|
||||
f"(auxiliary providers, fallback chain, model settings) is being IGNORED. "
|
||||
f"Fix the YAML and restart."
|
||||
)
|
||||
if backup_path is not None:
|
||||
msg += f" A copy of the corrupted file was saved to {backup_path}."
|
||||
logger.warning(msg)
|
||||
try:
|
||||
sys.stderr.write(f"⚠️ hermes config: {msg}\n")
|
||||
|
||||
@ -585,20 +585,41 @@ def collect_debug_report(
|
||||
# CLI entry points
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_debug_share(args):
|
||||
"""Collect debug report + full logs, upload each, print URLs."""
|
||||
@dataclass
class DebugShareResult:
    """Structured outcome of a ``debug share`` upload.

    Returned by :func:`build_debug_share` so non-CLI callers (the dashboard
    web server, gateway) can render the uploaded paste URLs as real links
    instead of scraping printed text.
    """

    # label -> paste URL (e.g. {"Report": "...", "agent.log": "..."})
    urls: dict[str, str]
    # human-readable "label: error" strings for optional uploads that failed
    failures: list[str]
    # whether force-mode redaction was applied before upload
    redacted: bool
    # how long until the pastes auto-delete, in seconds
    auto_delete_seconds: int
    # the summary report text (kept for local fallback)
    report: str = ""
|
||||
|
||||
|
||||
def build_debug_share(
|
||||
*,
|
||||
log_lines: int = 200,
|
||||
expiry: int = 7,
|
||||
redact: bool = True,
|
||||
) -> DebugShareResult:
|
||||
"""Collect the debug report + full logs, upload each, return the URLs.
|
||||
|
||||
This is the shared core behind ``hermes debug share`` (CLI) and the
|
||||
dashboard ``POST /api/ops/debug-share`` endpoint. It performs blocking
|
||||
network I/O (paste uploads) — callers inside an event loop must run it in
|
||||
a worker thread.
|
||||
|
||||
The summary report upload is required: on failure this raises
|
||||
``RuntimeError``. Full-log uploads are best-effort; their errors are
|
||||
collected into ``failures`` rather than raised.
|
||||
"""
|
||||
_best_effort_sweep_expired_pastes()
|
||||
|
||||
log_lines = getattr(args, "lines", 200)
|
||||
expiry = getattr(args, "expire", 7)
|
||||
local_only = getattr(args, "local", False)
|
||||
redact = not getattr(args, "no_redact", False)
|
||||
|
||||
if not local_only:
|
||||
print(_PRIVACY_NOTICE)
|
||||
|
||||
print("Collecting debug report...")
|
||||
|
||||
# Capture dump once — prepended to every paste for context.
|
||||
# The dump is already redacted at extract time via dump.py:_redact;
|
||||
# log_snapshots are redacted by _capture_default_log_snapshots when
|
||||
@ -639,71 +660,112 @@ def run_debug_share(args):
|
||||
if desktop_log:
|
||||
desktop_log = _REDACTION_BANNER + desktop_log
|
||||
|
||||
if local_only:
|
||||
print(report)
|
||||
if agent_log:
|
||||
print(f"\n\n{'=' * 60}")
|
||||
print("FULL agent.log")
|
||||
print(f"{'=' * 60}\n")
|
||||
print(agent_log)
|
||||
if gateway_log:
|
||||
print(f"\n\n{'=' * 60}")
|
||||
print("FULL gateway.log")
|
||||
print(f"{'=' * 60}\n")
|
||||
print(gateway_log)
|
||||
if desktop_log:
|
||||
print(f"\n\n{'=' * 60}")
|
||||
print("FULL desktop.log")
|
||||
print(f"{'=' * 60}\n")
|
||||
print(desktop_log)
|
||||
return
|
||||
|
||||
print("Uploading...")
|
||||
urls: dict[str, str] = {}
|
||||
failures: list[str] = []
|
||||
|
||||
# 1. Summary report (required)
|
||||
# 1. Summary report (required — raises on failure so callers can fall back)
|
||||
urls["Report"] = upload_to_pastebin(report, expiry_days=expiry)
|
||||
|
||||
# 2-4. Full logs (optional — failures are collected, not raised)
|
||||
for label, content in (
|
||||
("agent.log", agent_log),
|
||||
("gateway.log", gateway_log),
|
||||
("desktop.log", desktop_log),
|
||||
):
|
||||
if not content:
|
||||
continue
|
||||
try:
|
||||
urls[label] = upload_to_pastebin(content, expiry_days=expiry)
|
||||
except Exception as exc:
|
||||
failures.append(f"{label}: {exc}")
|
||||
|
||||
# Schedule auto-deletion after 6 hours.
|
||||
_schedule_auto_delete(list(urls.values()))
|
||||
|
||||
return DebugShareResult(
|
||||
urls=urls,
|
||||
failures=failures,
|
||||
redacted=redact,
|
||||
auto_delete_seconds=_AUTO_DELETE_SECONDS,
|
||||
report=report,
|
||||
)
|
||||
|
||||
|
||||
def run_debug_share(args):
|
||||
"""Collect debug report + full logs, upload each, print URLs."""
|
||||
log_lines = getattr(args, "lines", 200)
|
||||
expiry = getattr(args, "expire", 7)
|
||||
local_only = getattr(args, "local", False)
|
||||
redact = not getattr(args, "no_redact", False)
|
||||
|
||||
if local_only:
|
||||
# Local-only path never uploads — render the report to stdout and bail
|
||||
# before any network I/O. Mirrors the upload path's collection logic.
|
||||
_best_effort_sweep_expired_pastes()
|
||||
print("Collecting debug report...")
|
||||
dump_text = _capture_dump()
|
||||
log_snapshots = _capture_default_log_snapshots(log_lines, redact=redact)
|
||||
report = collect_debug_report(
|
||||
log_lines=log_lines,
|
||||
dump_text=dump_text,
|
||||
log_snapshots=log_snapshots,
|
||||
)
|
||||
agent_log = log_snapshots["agent"].full_text
|
||||
gateway_log = log_snapshots["gateway"].full_text
|
||||
desktop_log = log_snapshots["desktop"].full_text
|
||||
if agent_log:
|
||||
agent_log = dump_text + "\n\n--- full agent.log ---\n" + agent_log
|
||||
if gateway_log:
|
||||
gateway_log = dump_text + "\n\n--- full gateway.log ---\n" + gateway_log
|
||||
if desktop_log:
|
||||
desktop_log = dump_text + "\n\n--- full desktop.log ---\n" + desktop_log
|
||||
if redact:
|
||||
report = _REDACTION_BANNER + report
|
||||
if agent_log:
|
||||
agent_log = _REDACTION_BANNER + agent_log
|
||||
if gateway_log:
|
||||
gateway_log = _REDACTION_BANNER + gateway_log
|
||||
if desktop_log:
|
||||
desktop_log = _REDACTION_BANNER + desktop_log
|
||||
print(report)
|
||||
for title, body in (
|
||||
("FULL agent.log", agent_log),
|
||||
("FULL gateway.log", gateway_log),
|
||||
("FULL desktop.log", desktop_log),
|
||||
):
|
||||
if body:
|
||||
print(f"\n\n{'=' * 60}")
|
||||
print(title)
|
||||
print(f"{'=' * 60}\n")
|
||||
print(body)
|
||||
return
|
||||
|
||||
print(_PRIVACY_NOTICE)
|
||||
print("Collecting debug report...")
|
||||
print("Uploading...")
|
||||
|
||||
try:
|
||||
urls["Report"] = upload_to_pastebin(report, expiry_days=expiry)
|
||||
result = build_debug_share(
|
||||
log_lines=log_lines,
|
||||
expiry=expiry,
|
||||
redact=redact,
|
||||
)
|
||||
except RuntimeError as exc:
|
||||
print(f"\nUpload failed: {exc}", file=sys.stderr)
|
||||
print("\nFull report printed below — copy-paste it manually:\n")
|
||||
print(report)
|
||||
print("\nRun `hermes debug share --local` to print the report instead.\n")
|
||||
sys.exit(1)
|
||||
|
||||
# 2. Full agent.log (optional)
|
||||
if agent_log:
|
||||
try:
|
||||
urls["agent.log"] = upload_to_pastebin(agent_log, expiry_days=expiry)
|
||||
except Exception as exc:
|
||||
failures.append(f"agent.log: {exc}")
|
||||
|
||||
# 3. Full gateway.log (optional)
|
||||
if gateway_log:
|
||||
try:
|
||||
urls["gateway.log"] = upload_to_pastebin(gateway_log, expiry_days=expiry)
|
||||
except Exception as exc:
|
||||
failures.append(f"gateway.log: {exc}")
|
||||
|
||||
# 4. Full desktop.log (optional — Electron app boot + backend output)
|
||||
if desktop_log:
|
||||
try:
|
||||
urls["desktop.log"] = upload_to_pastebin(desktop_log, expiry_days=expiry)
|
||||
except Exception as exc:
|
||||
failures.append(f"desktop.log: {exc}")
|
||||
|
||||
# Print results
|
||||
label_width = max(len(k) for k in urls)
|
||||
label_width = max(len(k) for k in result.urls)
|
||||
print(f"\nDebug report uploaded:")
|
||||
for label, url in urls.items():
|
||||
for label, url in result.urls.items():
|
||||
print(f" {label:<{label_width}} {url}")
|
||||
|
||||
if failures:
|
||||
print(f"\n (failed to upload: {', '.join(failures)})")
|
||||
if result.failures:
|
||||
print(f"\n (failed to upload: {', '.join(result.failures)})")
|
||||
|
||||
# Schedule auto-deletion after 6 hours
|
||||
_schedule_auto_delete(list(urls.values()))
|
||||
print(f"\n⏱ Pastes will auto-delete in 6 hours.")
|
||||
hours = result.auto_delete_seconds // 3600
|
||||
print(f"\n⏱ Pastes will auto-delete in {hours} hours.")
|
||||
|
||||
# Manual delete fallback
|
||||
print(f"To delete now: hermes debug delete <url>")
|
||||
|
||||
@ -1016,6 +1016,51 @@ async def run_config_migrate():
|
||||
return {"ok": True, "pid": proc.pid, "name": "config-migrate"}
|
||||
|
||||
|
||||
class DebugShareRequest(BaseModel):
    # Optional JSON body accepted by POST /api/ops/debug-share; every field
    # has a default, and the endpoint substitutes a default instance when no
    # body is sent.

    # Redaction is ON by default — force-mode scrubs credential-shaped tokens
    # out of log content before it leaves the machine. The toggle exists so an
    # operator who knows the logs are clean can opt out for fuller fidelity.
    redact: bool = True
    # Recent log lines included in the summary tail (full logs are separate).
    # The endpoint clamps this to the range 1..5000 before use.
    lines: int = 200
|
||||
|
||||
|
||||
@app.post("/api/ops/debug-share")
async def run_debug_share_endpoint(body: DebugShareRequest | None = None):
    """Upload a redacted debug report + full logs and return the paste URLs.

    Unlike the other diagnostics actions (doctor, dump, prompt-size) this is
    *synchronous*: the whole point of ``debug share`` is the set of shareable
    URLs it produces, so we run the upload in a worker thread and return the
    structured ``{urls, failures, redacted, ...}`` payload directly. The
    dashboard renders those as real, copyable links instead of scraping a log
    tail. Paste auto-deletion is handled inside the share core; the delay is
    reported back as ``auto_delete_seconds``.

    Args:
        body: Optional request options; defaults are used when omitted.

    Returns:
        A dict with ``ok``, ``urls``, ``failures``, ``redacted`` and
        ``auto_delete_seconds``.

    Raises:
        HTTPException: 502 when the share core raises ``RuntimeError`` (the
            required summary-report upload failed), 500 for any other error.
    """
    from hermes_cli.debug import build_debug_share

    req = body or DebugShareRequest()
    # Clamp the caller-supplied tail length to 1..5000 lines.
    log_lines = max(1, min(int(req.lines), 5000))
    try:
        # build_debug_share does blocking network I/O, so keep it off the
        # event loop.
        result = await asyncio.to_thread(
            build_debug_share,
            log_lines=log_lines,
            redact=bool(req.redact),
        )
    except RuntimeError as exc:
        # Required summary-report upload failed (offline / paste service down).
        raise HTTPException(status_code=502, detail=f"Upload failed: {exc}") from exc
    except Exception as exc:
        _log.exception("debug share failed")
        raise HTTPException(status_code=500, detail=f"Failed: {exc}") from exc

    return {
        "ok": True,
        "urls": result.urls,
        "failures": result.failures,
        "redacted": result.redacted,
        "auto_delete_seconds": result.auto_delete_seconds,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gateway + update actions (invoked from the Status page).
|
||||
#
|
||||
|
||||
Reference in New Issue
Block a user