fix(gateway): keep running when platforms fail; add per-platform circuit breaker + /platform (#26600)

Stop the gateway from exiting (or systemd-restart-looping) when a single
messaging adapter fails at startup or runtime.  A misconfigured WhatsApp
(npm install timeout, unpaired bridge, missing creds.json) used to take
the entire gateway down, killing cron jobs and any other connected
platforms with it.

Changes:

  • Startup (gateway/run.py): when connected_count==0 but the only
    errors are retryable, log a degraded-state warning and keep the
    gateway alive instead of returning False.  Reconnect watcher then
    recovers platforms as their underlying problem clears.

  • Runtime (gateway/run.py _handle_adapter_fatal_error): when the last
    adapter goes down with a retryable error and is queued for
    reconnection, stay alive instead of exit-with-failure.  Previously
    this triggered systemd Restart=on-failure, which created infinite
    restart loops on persistent retryable failures (proxy outage,
    repeated bridge crashes).

  • Reconnect watcher (gateway/run.py _platform_reconnect_watcher):
    replace the 20-attempt hard drop with a circuit-breaker pause.
    After _PAUSE_AFTER_FAILURES (10) consecutive retryable failures, the
    platform stays in _failed_platforms with paused=True so the watcher
    skips it but the operator can still see and resume it.  Non-retryable
    errors still drop out of the queue immediately.  Resolves #17063
    (gateway giving up on Telegram after 20 attempts).

  • WhatsApp preflight (gateway/platforms/whatsapp.py): refuse to start
    the Node bridge when creds.json is missing.  Sets a non-retryable
    whatsapp_not_paired fatal error so the watcher drops it cleanly
    with a single 'run hermes whatsapp' log line instead of paying the
    30s bridge bootstrap timeout on every gateway start.

  • WhatsApp setup ordering (hermes_cli/main.py cmd_whatsapp): only set
    WHATSAPP_ENABLED=true once pairing actually succeeds.  Previously
    the wizard wrote the env var at step 2 (before npm install and QR
    pairing), so any Ctrl+C left .env claiming WhatsApp was ready when
    the bridge had no creds.json.  Also propagate the env var when the
    user keeps an existing pairing on a re-run.

  • /platform slash command (hermes_cli/commands.py + gateway/run.py):
    new gateway-only command for manual circuit-breaker control.
      /platform list           — show connected + failed/paused platforms
      /platform pause <name>   — silence a known-broken platform
      /platform resume <name>  — re-queue a paused platform

Tests:

  • New: pause/resume helpers, /platform list|pause|resume command,
    WhatsApp creds.json preflight, WhatsApp setup ordering.
  • Updated: stale assertions that codified the old 'exit and let
    systemd restart' behavior in test_runner_fatal_adapter.py,
    test_runner_startup_failures.py, and test_platform_reconnect.py
    (the 20-attempt give-up test became a circuit-breaker pause test).

5488 tests pass in tests/gateway/.
This commit is contained in:
Teknium
2026-05-15 14:32:14 -07:00
committed by GitHub
parent 3b9368a0c4
commit 518f39557b
9 changed files with 745 additions and 62 deletions

View File

@ -493,13 +493,45 @@ class WhatsAppAdapter(BasePlatformAdapter):
"""
if not check_whatsapp_requirements():
logger.warning("[%s] Node.js not found. WhatsApp requires Node.js.", self.name)
self._set_fatal_error(
"whatsapp_node_missing",
"Node.js is not installed — install Node.js and re-run `hermes gateway`.",
retryable=False,
)
return False
bridge_path = Path(self._bridge_script)
if not bridge_path.exists():
logger.warning("[%s] Bridge script not found: %s", self.name, bridge_path)
self._set_fatal_error(
"whatsapp_bridge_missing",
f"WhatsApp bridge script missing at {bridge_path}.",
retryable=False,
)
return False
# Pre-flight: skip the 30s bridge bootstrap entirely if the user
# never finished pairing. Without creds.json the bridge prints
# QR codes to its log file and never reaches status:connected,
# so every gateway restart paid the 30s timeout + queued WhatsApp
# for indefinite retries. Mark non-retryable so the user gets a
# clear "run hermes whatsapp" message instead of the watcher
# silently hammering an unconfigured platform.
creds_path = self._session_path / "creds.json"
if not creds_path.exists():
logger.warning(
"[%s] WhatsApp is enabled but not paired (no creds.json at %s). "
"Run `hermes whatsapp` to pair, or remove WHATSAPP_ENABLED from "
"your .env to disable.",
self.name, creds_path,
)
self._set_fatal_error(
"whatsapp_not_paired",
"WhatsApp enabled but not paired — run `hermes whatsapp` to pair.",
retryable=False,
)
return False
logger.info("[%s] Bridge found at %s", self.name, bridge_path)
# Acquire scoped lock to prevent duplicate sessions

View File

@ -1990,21 +1990,21 @@ class GatewayRunner:
await self.stop()
elif not self.adapters and self._failed_platforms:
# All platforms are down and queued for background reconnection.
# If the error is retryable, exit with failure so systemd Restart=on-failure
# can restart the process. Otherwise stay alive and keep retrying in background.
if adapter.fatal_error_retryable:
self._exit_reason = adapter.fatal_error_message or "All messaging platforms failed with retryable errors"
self._exit_with_failure = True
logger.error(
"All messaging platforms failed with retryable errors. "
"Shutting down gateway for service restart (systemd will retry)."
)
await self.stop()
else:
logger.warning(
"No connected messaging platforms remain, but %d platform(s) queued for reconnection",
len(self._failed_platforms),
)
# Keep the gateway alive so:
# • cron jobs still run
# • the reconnect watcher can recover platforms when the
# underlying problem clears (proxy comes back, user runs
# `hermes whatsapp`, etc.)
# We used to exit-with-failure here to trigger systemd restart,
# but that converted a transient outage into a restart loop and
# killed in-process state every time. The reconnect watcher
# already handles long-running recovery — let it do its job.
logger.warning(
"No connected messaging platforms remain, but %d platform(s) "
"queued for reconnection — gateway staying alive, watcher will "
"retry in background.",
len(self._failed_platforms),
)
def _request_clean_exit(self, reason: str) -> None:
self._exit_cleanly = True
@ -2180,6 +2180,73 @@ class GatewayRunner:
except Exception:
pass
# ------------------------------------------------------------------
# Per-platform circuit breaker (pause/resume) — used by the reconnect
# watcher when a retryable failure recurs past a threshold, and by the
# /platform pause|resume slash command for manual control.
# ------------------------------------------------------------------
def _pause_failed_platform(self, platform, *, reason: str = "") -> None:
"""Mark a queued platform as paused — keep it in ``_failed_platforms``
but stop the reconnect watcher from hammering it.
Used by the circuit breaker after ``_PAUSE_AFTER_FAILURES`` consecutive
retryable failures, and by ``/platform pause <name>`` for manual
intervention. Paused platforms are surfaced in ``/platform list``
and resumed with ``/platform resume <name>``.
"""
info = getattr(self, "_failed_platforms", {}).get(platform)
if info is None:
return
if info.get("paused"):
return
info["paused"] = True
info["pause_reason"] = reason or "auto-paused after repeated failures"
# Push next_retry far enough out that even if "paused" is missed
# by a stale code path, the watcher won't fire on it.
info["next_retry"] = float("inf")
try:
self._update_platform_runtime_status(
platform.value,
platform_state="paused",
error_code=None,
error_message=info["pause_reason"],
)
except Exception:
pass
logger.warning(
"%s paused after %d consecutive failures (%s) — "
"fix the underlying issue then run `/platform resume %s` "
"to retry, or `hermes gateway restart` to restart the gateway.",
platform.value, info.get("attempts", 0),
info["pause_reason"], platform.value,
)
def _resume_paused_platform(self, platform) -> bool:
"""Unpause a platform — reset its attempt counter and schedule an
immediate retry. Returns True if the platform was paused and is
now queued; False if it wasn't paused (or wasn't in the queue).
"""
info = getattr(self, "_failed_platforms", {}).get(platform)
if info is None:
return False
if not info.get("paused"):
return False
info["paused"] = False
info.pop("pause_reason", None)
info["attempts"] = 0
info["next_retry"] = time.monotonic() # retry on next watcher tick
try:
self._update_platform_runtime_status(
platform.value,
platform_state="retrying",
error_code=None,
error_message=None,
)
except Exception:
pass
logger.info("%s resumed — retrying on next watcher tick", platform.value)
return True
@staticmethod
def _load_prefill_messages() -> List[Dict[str, Any]]:
"""Load ephemeral prefill messages from config or env var.
@ -3613,16 +3680,32 @@ class GatewayRunner:
return True
if enabled_platform_count > 0:
if startup_retryable_errors:
# At least one platform attempted a connection and failed —
# this is a real startup error that should block the gateway.
# All enabled platforms hit retryable failures (network
# blip, bridge not paired, npm install timeout, etc.).
# Keep the gateway alive so:
# • cron jobs still run
# • the reconnect watcher gets a chance to recover the
# failing platforms once the underlying problem is
# fixed (e.g. user runs `hermes whatsapp`, fixes
# proxy, etc.)
# Exiting here used to convert a single misconfigured
# platform into an infinite systemd restart loop.
reason = "; ".join(startup_retryable_errors)
logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
logger.warning(
"Gateway started with no connected platforms — "
"%d platform(s) queued for retry: %s",
len(self._failed_platforms), reason,
)
try:
from gateway.status import write_runtime_status
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
write_runtime_status(
gateway_state="degraded",
exit_reason=None,
)
except Exception:
pass
return False
# Fall through to the normal "running" state — reconnect
# watcher takes it from here.
# All enabled platforms had no adapter (missing library or credentials).
# In fleet deployments the same config.yaml is shared across nodes that
# may only have credentials for a subset of platforms. Rather than
@ -4737,11 +4820,15 @@ class GatewayRunner:
"""Background task that periodically retries connecting failed platforms.
Uses exponential backoff: 30s → 60s → 120s → 240s → 300s (cap).
Stops retrying a platform after 20 failed attempts or if the error
is non-retryable (e.g. bad auth token).
Retryable failures keep retrying at the backoff cap indefinitely
— but if a platform fails ``_PAUSE_AFTER_FAILURES`` times in a row
without ever succeeding, it is *paused*: kept in the retry queue
but no longer hammered. The user surfaces it with ``/platform list``
and resumes it with ``/platform resume <name>``. Non-retryable
failures (bad auth, etc.) still drop out of the queue immediately.
"""
_MAX_ATTEMPTS = 20
_BACKOFF_CAP = 300 # 5 minutes max between retries
_PAUSE_AFTER_FAILURES = 10 # circuit-breaker threshold
await asyncio.sleep(10) # initial delay — let startup finish
while self._running:
@ -4758,22 +4845,18 @@ class GatewayRunner:
if not self._running:
return
info = self._failed_platforms[platform]
# Skip paused platforms entirely — they need explicit
# /platform resume to come back.
if info.get("paused"):
continue
if now < info["next_retry"]:
continue # not time yet
if info["attempts"] >= _MAX_ATTEMPTS:
logger.warning(
"Giving up reconnecting %s after %d attempts",
platform.value, info["attempts"],
)
del self._failed_platforms[platform]
continue
platform_config = info["config"]
attempt = info["attempts"] + 1
logger.info(
"Reconnecting %s (attempt %d/%d)...",
platform.value, attempt, _MAX_ATTEMPTS,
"Reconnecting %s (attempt %d)...",
platform.value, attempt,
)
try:
@ -4838,6 +4921,14 @@ class GatewayRunner:
"Reconnect %s failed, next retry in %ds",
platform.value, backoff,
)
if attempt >= _PAUSE_AFTER_FAILURES:
self._pause_failed_platform(
platform,
reason=(
adapter.fatal_error_message
or "failed to reconnect"
),
)
except Exception as e:
self._update_platform_runtime_status(
platform.value,
@ -4852,6 +4943,8 @@ class GatewayRunner:
"Reconnect %s error: %s, next retry in %ds",
platform.value, e, backoff,
)
if attempt >= _PAUSE_AFTER_FAILURES:
self._pause_failed_platform(platform, reason=str(e))
# Check every 10 seconds for platforms that need reconnection
for _ in range(10):
@ -6451,6 +6544,9 @@ class GatewayRunner:
if canonical == "agents":
return await self._handle_agents_command(event)
if canonical == "platform":
return await self._handle_platform_command(event)
if canonical == "restart":
return await self._handle_restart_command(event)
@ -8698,6 +8794,99 @@ class GatewayRunner:
else:
return t("gateway.stop.no_active")
async def _handle_platform_command(self, event: MessageEvent) -> str:
"""Handle ``/platform list|pause|resume [name]`` — surface and
manually control failed/paused gateway adapters.
Examples:
``/platform list`` — show connected + failed/paused platforms
``/platform pause whatsapp`` — stop the reconnect watcher hammering whatsapp
``/platform resume whatsapp`` — re-queue a paused platform for retry
"""
text = (getattr(event, "content", "") or "").strip()
# Strip the leading "/platform" (or "/PLATFORM") token if present
parts = text.split(maxsplit=2)
if parts and parts[0].lower().lstrip("/").startswith("platform"):
parts = parts[1:]
action = (parts[0] if parts else "list").lower()
target = parts[1].lower() if len(parts) > 1 else ""
# Resolve platform name (case-insensitive, value match)
def _resolve_platform(name: str):
if not name:
return None
for p in Platform.__members__.values():
if p.value.lower() == name:
return p
return None
if action == "list":
lines = ["**Gateway platforms**"]
connected = sorted(p.value for p in self.adapters.keys())
if connected:
lines.append("Connected: " + ", ".join(connected))
else:
lines.append("Connected: (none)")
failed = getattr(self, "_failed_platforms", {}) or {}
if failed:
for p, info in failed.items():
if info.get("paused"):
reason = info.get("pause_reason") or "paused"
lines.append(
f" · {p.value} — PAUSED ({reason}). "
f"Resume with `/platform resume {p.value}`."
)
else:
attempts = info.get("attempts", 0)
lines.append(
f" · {p.value} — retrying (attempt {attempts})"
)
else:
lines.append("Failed/paused: (none)")
return "\n".join(lines)
if action in ("pause", "resume"):
if not target:
return f"Usage: /platform {action} <name>"
platform = _resolve_platform(target)
if platform is None:
return f"Unknown platform: {target}"
failed = getattr(self, "_failed_platforms", {}) or {}
if action == "pause":
if platform not in failed:
return (
f"{platform.value} is not in the retry queue "
f"(it's either connected or not enabled)."
)
if failed[platform].get("paused"):
return f"{platform.value} is already paused."
self._pause_failed_platform(platform, reason="paused via /platform pause")
return (
f"{platform.value} paused. "
f"Resume with `/platform resume {platform.value}` or "
f"`hermes gateway restart` to reset."
)
# action == "resume"
if platform not in failed:
return (
f"{platform.value} is not in the retry queue — "
f"nothing to resume."
)
if not failed[platform].get("paused"):
return (
f"{platform.value} is already retrying — "
f"no resume needed."
)
self._resume_paused_platform(platform)
return f"{platform.value} resumed — retrying on next watcher tick."
return (
"Usage: /platform <list|pause|resume> [name]\n"
" /platform list — show platform status\n"
" /platform pause <name> — stop retrying a failing platform\n"
" /platform resume <name> — re-queue a paused platform"
)
async def _handle_restart_command(self, event: MessageEvent) -> Union[str, EphemeralReply]:
"""Handle /restart command - drain active work, then restart the gateway."""
# Defensive idempotency check: if the previous gateway process

View File

@ -198,6 +198,8 @@ COMMAND_REGISTRY: list[CommandDef] = [
args_hint="[days]"),
CommandDef("platforms", "Show gateway/messaging platform status", "Info",
cli_only=True, aliases=("gateway",)),
CommandDef("platform", "Pause, resume, or list a failing gateway platform", "Info",
gateway_only=True, args_hint="<pause|resume|list> [name]"),
CommandDef("copy", "Copy the last assistant response to clipboard", "Info",
cli_only=True, args_hint="[number]"),
CommandDef("paste", "Attach clipboard image from your clipboard", "Info",

View File

@ -1522,14 +1522,18 @@ def cmd_whatsapp(args):
)
print(f"\n✓ Mode: {mode_label}")
# ── Step 2: Enable WhatsApp ──────────────────────────────────────────
# ── Step 2: Mode is selected, will enable WhatsApp only after pairing ──
# We intentionally don't write WHATSAPP_ENABLED=true here. If the user
# aborts the wizard later (Ctrl+C, failed npm install, missed QR scan),
# we'd otherwise leave .env claiming WhatsApp is ready when the bridge
# has no creds.json. Every subsequent `hermes gateway` then paid a 30s
# bridge-bootstrap timeout and queued WhatsApp for indefinite retries.
# Now: aborted setup leaves WHATSAPP_ENABLED unset → gateway skips it.
# Re-runs that already have WHATSAPP_ENABLED=true (from a prior
# successful pairing) stay enabled — we just don't write it pre-emptively.
print()
current = get_env_value("WHATSAPP_ENABLED")
if current and current.lower() == "true":
if (get_env_value("WHATSAPP_ENABLED") or "").lower() == "true":
print("✓ WhatsApp is already enabled")
else:
save_env_value("WHATSAPP_ENABLED", "true")
print("✓ WhatsApp enabled")
# ── Step 3: Allowed users ────────────────────────────────────────────
current_users = get_env_value("WHATSAPP_ALLOWED_USERS") or ""
@ -1619,6 +1623,12 @@ def cmd_whatsapp(args):
session_dir.mkdir(parents=True, exist_ok=True)
print(" ✓ Session cleared")
else:
# Existing pairing — ensure WHATSAPP_ENABLED reflects that.
# (Older installs may have lost the env var; covers re-runs
# where the user picked "no, keep my session" but the var
# was never set or got removed.)
if (get_env_value("WHATSAPP_ENABLED") or "").lower() != "true":
save_env_value("WHATSAPP_ENABLED", "true")
print("\n✓ WhatsApp is configured and paired!")
print(" Start the gateway with: hermes gateway")
return
@ -1647,6 +1657,11 @@ def cmd_whatsapp(args):
# ── Step 7: Post-pairing ─────────────────────────────────────────────
print()
if (session_dir / "creds.json").exists():
# Only enable WhatsApp now that pairing actually succeeded. If the
# user Ctrl+C'd at any earlier step, WHATSAPP_ENABLED stays unset
# and `hermes gateway` skips it cleanly instead of paying a 30s
# bridge timeout + queueing the platform for indefinite retries.
save_env_value("WHATSAPP_ENABLED", "true")
print("✓ WhatsApp paired successfully!")
print()
if wa_mode == "bot":

View File

@ -294,15 +294,63 @@ class TestPlatformReconnectWatcher:
assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2
@pytest.mark.asyncio
async def test_reconnect_gives_up_after_max_attempts(self):
"""After max attempts, platform should be removed from retry queue."""
async def test_reconnect_pauses_after_circuit_breaker_threshold(self):
"""After enough consecutive retryable failures, the watcher should
*pause* the platform (keep it in the queue but stop hammering it),
not drop it. The user resumes via /platform resume.
"""
runner = _make_runner()
platform_config = PlatformConfig(enabled=True, token="test")
# 9 prior attempts — the next failure will be the 10th and should
# trip the circuit breaker.
runner._failed_platforms[Platform.TELEGRAM] = {
"config": platform_config,
"attempts": 9,
"next_retry": time.monotonic() - 1,
}
fail_adapter = StubAdapter(
succeed=False, fatal_error="DNS failure", fatal_retryable=True
)
real_sleep = asyncio.sleep
with patch.object(runner, "_create_adapter", return_value=fail_adapter):
async def run_one_iteration():
runner._running = True
call_count = 0
async def fake_sleep(n):
nonlocal call_count
call_count += 1
if call_count > 1:
runner._running = False
await real_sleep(0)
with patch("asyncio.sleep", side_effect=fake_sleep):
await runner._platform_reconnect_watcher()
await run_one_iteration()
# Platform stays in queue — paused, not dropped
assert Platform.TELEGRAM in runner._failed_platforms
info = runner._failed_platforms[Platform.TELEGRAM]
assert info["paused"] is True
assert info["attempts"] == 10
assert "pause_reason" in info
@pytest.mark.asyncio
async def test_reconnect_skips_paused_platforms(self):
"""A paused platform should not be retried by the watcher tick."""
runner = _make_runner()
platform_config = PlatformConfig(enabled=True, token="test")
runner._failed_platforms[Platform.TELEGRAM] = {
"config": platform_config,
"attempts": 20, # At max
"next_retry": time.monotonic() - 1,
"attempts": 10,
"next_retry": time.monotonic() - 1, # would normally retry now
"paused": True,
"pause_reason": "paused via /platform pause",
}
real_sleep = asyncio.sleep
@ -324,8 +372,10 @@ class TestPlatformReconnectWatcher:
await run_one_iteration()
assert Platform.TELEGRAM not in runner._failed_platforms
mock_create.assert_not_called() # Should give up without trying
# Paused platform stays queued and was never touched
assert Platform.TELEGRAM in runner._failed_platforms
assert runner._failed_platforms[Platform.TELEGRAM]["paused"] is True
mock_create.assert_not_called()
@pytest.mark.asyncio
async def test_reconnect_skips_when_not_time_yet(self):
@ -459,11 +509,12 @@ class TestRuntimeDisconnectQueuing:
assert Platform.TELEGRAM not in runner._failed_platforms
@pytest.mark.asyncio
async def test_retryable_error_exits_for_service_restart_when_all_down(self):
"""Gateway should exit with failure when all platforms fail with retryable errors.
This lets systemd Restart=on-failure restart the process, which is more
reliable than in-process background reconnection after exhausted retries.
async def test_retryable_error_keeps_gateway_alive_when_all_down(self):
"""When all adapters fail at runtime with retryable errors, the
gateway should stay alive and let the reconnect watcher recover them
in the background. (Previously this exited-with-failure to trigger
a systemd restart — that converted transient outages into infinite
restart loops and killed in-process state.)
"""
runner = _make_runner()
runner.stop = AsyncMock()
@ -474,9 +525,9 @@ class TestRuntimeDisconnectQueuing:
await runner._handle_adapter_fatal_error(adapter)
# stop() SHOULD be called — gateway exits for systemd restart
runner.stop.assert_called_once()
assert runner._exit_with_failure is True
# stop() should NOT be called — gateway stays alive for the watcher
runner.stop.assert_not_called()
assert runner._exit_with_failure is False
assert Platform.TELEGRAM in runner._failed_platforms
@pytest.mark.asyncio
@ -512,3 +563,154 @@ class TestRuntimeDisconnectQueuing:
await runner._handle_adapter_fatal_error(adapter)
runner.stop.assert_called_once()
# --- Pause / resume circuit breaker ---
class TestPauseResume:
"""Test the per-platform pause/resume helpers and slash command."""
def test_pause_marks_platform_paused(self):
runner = _make_runner()
runner._failed_platforms[Platform.TELEGRAM] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 3,
"next_retry": time.monotonic() + 30,
}
runner._pause_failed_platform(Platform.TELEGRAM, reason="manual")
info = runner._failed_platforms[Platform.TELEGRAM]
assert info["paused"] is True
assert info["pause_reason"] == "manual"
assert info["next_retry"] == float("inf")
def test_pause_is_idempotent(self):
runner = _make_runner()
runner._failed_platforms[Platform.TELEGRAM] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 3,
"next_retry": time.monotonic() + 30,
"paused": True,
"pause_reason": "first reason",
}
runner._pause_failed_platform(Platform.TELEGRAM, reason="second reason")
# Reason should not be overwritten on a second pause call.
assert (
runner._failed_platforms[Platform.TELEGRAM]["pause_reason"]
== "first reason"
)
def test_pause_no_op_when_platform_not_queued(self):
runner = _make_runner()
# No exception even when the platform isn't in _failed_platforms.
runner._pause_failed_platform(Platform.TELEGRAM, reason="x")
assert Platform.TELEGRAM not in runner._failed_platforms
def test_resume_clears_paused_and_resets_attempts(self):
runner = _make_runner()
runner._failed_platforms[Platform.TELEGRAM] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 10,
"next_retry": float("inf"),
"paused": True,
"pause_reason": "auto-paused",
}
assert runner._resume_paused_platform(Platform.TELEGRAM) is True
info = runner._failed_platforms[Platform.TELEGRAM]
assert info["paused"] is False
assert info["attempts"] == 0
assert info["next_retry"] != float("inf")
assert "pause_reason" not in info
def test_resume_returns_false_when_not_paused(self):
runner = _make_runner()
runner._failed_platforms[Platform.TELEGRAM] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 1,
"next_retry": time.monotonic() + 30,
}
assert runner._resume_paused_platform(Platform.TELEGRAM) is False
def test_resume_returns_false_when_not_queued(self):
runner = _make_runner()
assert runner._resume_paused_platform(Platform.TELEGRAM) is False
class TestPlatformSlashCommand:
"""Test the /platform list|pause|resume slash command handler."""
def _make_event(self, content: str):
ev = MagicMock()
ev.content = content
return ev
@pytest.mark.asyncio
async def test_list_shows_connected_and_paused(self):
runner = _make_runner()
runner.adapters[Platform.DISCORD] = StubAdapter(platform=Platform.DISCORD)
runner._failed_platforms[Platform.WHATSAPP] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 10,
"next_retry": float("inf"),
"paused": True,
"pause_reason": "not paired",
}
out = await runner._handle_platform_command(self._make_event("/platform list"))
assert "discord" in out
assert "whatsapp" in out
assert "PAUSED" in out
assert "not paired" in out
@pytest.mark.asyncio
async def test_pause_command_pauses_queued_platform(self):
runner = _make_runner()
runner._failed_platforms[Platform.WHATSAPP] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 2,
"next_retry": time.monotonic() + 30,
}
out = await runner._handle_platform_command(
self._make_event("/platform pause whatsapp")
)
assert "paused" in out.lower()
assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is True
@pytest.mark.asyncio
async def test_pause_rejects_unqueued_platform(self):
runner = _make_runner()
out = await runner._handle_platform_command(
self._make_event("/platform pause whatsapp")
)
assert "not in the retry queue" in out
@pytest.mark.asyncio
async def test_resume_command_resumes_paused_platform(self):
runner = _make_runner()
runner._failed_platforms[Platform.WHATSAPP] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 10,
"next_retry": float("inf"),
"paused": True,
"pause_reason": "x",
}
out = await runner._handle_platform_command(
self._make_event("/platform resume whatsapp")
)
assert "resumed" in out.lower()
assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is False
@pytest.mark.asyncio
async def test_unknown_platform_name(self):
runner = _make_runner()
out = await runner._handle_platform_command(
self._make_event("/platform pause notarealplatform")
)
assert "Unknown platform" in out
@pytest.mark.asyncio
async def test_bare_platform_shows_usage_with_list(self):
# An empty /platform call defaults to "list".
runner = _make_runner()
out = await runner._handle_platform_command(self._make_event("/platform"))
assert "Gateway platforms" in out

View File

@ -68,7 +68,11 @@ async def test_runner_requests_clean_exit_for_nonretryable_startup_conflict(monk
@pytest.mark.asyncio
async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatch, tmp_path):
"""Retryable runtime fatal errors queue the platform for reconnection
instead of shutting down the gateway."""
AND keep the gateway alive — the background reconnect watcher recovers
the platform when the underlying issue clears. (Previously this
exited-with-failure to trigger a systemd restart; that converted
transient failures into infinite restart loops.)
"""
config = GatewayConfig(
platforms={
Platform.WHATSAPP: PlatformConfig(enabled=True, token="token")
@ -89,8 +93,8 @@ async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatc
await runner._handle_adapter_fatal_error(adapter)
# Should shut down with failure — systemd Restart=on-failure will restart
runner.stop.assert_awaited_once()
assert runner._exit_with_failure is True
# Gateway stays alive — watcher will retry in background
runner.stop.assert_not_awaited()
assert runner._exit_with_failure is False
assert Platform.WHATSAPP in runner._failed_platforms
assert runner._failed_platforms[Platform.WHATSAPP]["attempts"] == 0

View File

@ -64,7 +64,14 @@ class _SuccessfulAdapter(BasePlatformAdapter):
@pytest.mark.asyncio
async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, tmp_path):
async def test_runner_stays_alive_for_retryable_startup_errors(monkeypatch, tmp_path):
"""Retryable startup errors should leave the gateway running in
degraded mode so the reconnect watcher can recover the platform when
the underlying problem clears. Previously this returned False from
``start()`` and exited the process, which converted a single broken
platform (e.g. unpaired WhatsApp, DNS blip on Telegram) into a
systemd restart loop and killed cron jobs in the meantime.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
config = GatewayConfig(
platforms={
@ -78,11 +85,13 @@ async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch,
ok = await runner.start()
assert ok is False
# Gateway stays alive in degraded mode; reconnect watcher takes over.
assert ok is True
assert runner.should_exit_cleanly is False
state = read_runtime_status()
assert state["gateway_state"] == "startup_failed"
assert "temporary DNS resolution failure" in state["exit_reason"]
assert state["gateway_state"] in {"degraded", "running"}
# Telegram was queued for retry, not given up on.
assert Platform.TELEGRAM in runner._failed_platforms
assert state["platforms"]["telegram"]["state"] == "retrying"
assert state["platforms"]["telegram"]["error_code"] == "telegram_connect_error"

View File

@ -611,3 +611,93 @@ class TestHttpSessionLifecycle:
mock_task.cancel.assert_not_called()
assert adapter._poll_task is None
# ---------------------------------------------------------------------------
# Pre-flight: refuse to start the bridge when creds.json is missing
# ---------------------------------------------------------------------------
class TestNoCredsPreflight:
"""Verify ``connect()`` fast-fails as non-retryable when WhatsApp is
enabled but the user never finished pairing (no ``creds.json``).
Without this guard, every gateway boot:
• spawned the bridge subprocess (npm install if needed)
• waited 30s for status:connected (never happens without creds)
• queued WhatsApp for indefinite retries that would just repeat
With the guard, ``connect()`` returns False immediately with a
non-retryable fatal error so the reconnect watcher drops the platform
and the gateway gets a single clear log line telling the user to run
``hermes whatsapp``.
"""
@pytest.mark.asyncio
async def test_connect_returns_false_when_no_creds(self, tmp_path):
from gateway.platforms.whatsapp import WhatsAppAdapter
adapter = WhatsAppAdapter.__new__(WhatsAppAdapter)
adapter.platform = Platform.WHATSAPP
adapter.config = MagicMock()
adapter._bridge_port = 19876
# Point bridge_script at a real existing file so the earlier
# bridge-missing check doesn't trip — we want to exercise the
# creds.json check specifically.
bridge = tmp_path / "bridge.js"
bridge.write_text("// stub")
adapter._bridge_script = str(bridge)
adapter._session_path = tmp_path / "session" # no creds.json inside
adapter._session_path.mkdir()
adapter._bridge_log_fh = None
adapter._fatal_error_code = None
adapter._fatal_error_message = None
adapter._fatal_error_retryable = True
with patch(
"gateway.platforms.whatsapp.check_whatsapp_requirements",
return_value=True,
):
result = await adapter.connect()
assert result is False
# Non-retryable so the reconnect watcher drops it cleanly
assert adapter._fatal_error_code == "whatsapp_not_paired"
assert adapter._fatal_error_retryable is False
@pytest.mark.asyncio
async def test_connect_proceeds_when_creds_present(self, tmp_path):
"""When creds.json exists, the preflight check is bypassed and
connect() proceeds to the bridge bootstrap path. We don't fully
simulate the bridge here — we just verify no fast-fail occurs.
"""
from gateway.platforms.whatsapp import WhatsAppAdapter
adapter = WhatsAppAdapter.__new__(WhatsAppAdapter)
adapter.platform = Platform.WHATSAPP
adapter.config = MagicMock()
adapter._bridge_port = 19877
bridge = tmp_path / "bridge.js"
bridge.write_text("// stub")
adapter._bridge_script = str(bridge)
session_dir = tmp_path / "session"
session_dir.mkdir()
(session_dir / "creds.json").write_text("{}")
adapter._session_path = session_dir
adapter._bridge_log_fh = None
adapter._fatal_error_code = None
adapter._fatal_error_message = None
adapter._fatal_error_retryable = True
# Stub _acquire_platform_lock to return False so connect() exits
# cleanly *after* the preflight, without spawning subprocesses.
adapter._acquire_platform_lock = MagicMock(return_value=False)
with patch(
"gateway.platforms.whatsapp.check_whatsapp_requirements",
return_value=True,
):
result = await adapter.connect()
# Preflight passed — exits because we faked lock acquisition,
# but the fatal-error code is NOT the "not paired" one.
assert result is False
assert adapter._fatal_error_code != "whatsapp_not_paired"

View File

@ -0,0 +1,140 @@
"""Regression tests for ``cmd_whatsapp`` env-var write ordering.
Before the fix, ``hermes whatsapp`` wrote ``WHATSAPP_ENABLED=true`` at
step 2 — before npm install (step 4) and before QR pairing (step 6).
If the user Ctrl+C'd at any later step, ``.env`` claimed WhatsApp was
ready when the bridge still had no ``creds.json``. Every subsequent
``hermes gateway`` then paid a 30s bridge-bootstrap timeout and queued
WhatsApp for indefinite retries — looking like "the gateway is broken."
The fix: only set ``WHATSAPP_ENABLED=true`` once pairing actually
succeeds (creds.json exists). Aborted setup leaves no enabled state.
"""
from __future__ import annotations
import io
import os
from contextlib import redirect_stdout
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
@pytest.fixture
def isolated_home(tmp_path, monkeypatch):
home = tmp_path / "home"
hermes = home / ".hermes"
hermes.mkdir(parents=True)
monkeypatch.setattr(Path, "home", lambda: home)
monkeypatch.setenv("HERMES_HOME", str(hermes))
# Ensure get_env_value cache doesn't carry stale state.
for key in list(os.environ):
if key.startswith("WHATSAPP_"):
monkeypatch.delenv(key, raising=False)
return hermes
def _env_value(hermes_home: Path, key: str) -> str | None:
env_file = hermes_home / ".env"
if not env_file.exists():
return None
for line in env_file.read_text().splitlines():
if "=" not in line:
continue
k, _, v = line.partition("=")
if k.strip() == key:
return v.strip().strip('"').strip("'")
return None
def test_aborted_setup_does_not_enable_whatsapp(isolated_home, monkeypatch):
"""User picks mode 1, then Ctrl+C's at the allowed-users prompt.
WHATSAPP_ENABLED must NOT be present in .env after abort.
"""
from hermes_cli.main import cmd_whatsapp
# First input() = mode choice, second input() = allowed-users prompt
# We raise KeyboardInterrupt on the second call to simulate abort.
inputs = iter(["1"])
def fake_input(_prompt=""):
try:
return next(inputs)
except StopIteration:
raise KeyboardInterrupt
monkeypatch.setattr("builtins.input", fake_input)
# _require_tty calls sys.stdin.isatty — make it pass.
monkeypatch.setattr("hermes_cli.main._require_tty", lambda *_a, **_kw: None)
# No node, no bridge script — we shouldn't reach those steps anyway.
buf = io.StringIO()
with redirect_stdout(buf):
try:
cmd_whatsapp(MagicMock())
except KeyboardInterrupt:
pass
assert _env_value(isolated_home, "WHATSAPP_ENABLED") is None, (
"Setup aborted before pairing — WHATSAPP_ENABLED must not be set. "
f"Got .env: {(isolated_home / '.env').read_text() if (isolated_home / '.env').exists() else '(missing)'}"
)
def test_existing_pairing_skip_branch_enables_whatsapp(isolated_home, monkeypatch):
"""User runs ``hermes whatsapp`` with an existing paired session and
chooses "no, keep my session" at the re-pair prompt. The env var
should be (re-)written to true so the gateway picks WhatsApp back up,
even if the var was lost since the original pairing.
"""
from hermes_cli.main import cmd_whatsapp
# Pre-create a paired session WITHOUT WHATSAPP_ENABLED in .env.
session = isolated_home / "whatsapp" / "session"
session.mkdir(parents=True)
(session / "creds.json").write_text("{}")
monkeypatch.setenv("WHATSAPP_MODE", "bot")
monkeypatch.setenv("WHATSAPP_ALLOWED_USERS", "15551234567")
# mode already set → skip mode prompt; users already set → skip update
# prompt with "no"; pairing exists → "no, keep session" → return.
inputs = iter(["n", "n"])
def fake_input(_prompt=""):
try:
return next(inputs)
except StopIteration:
return "n"
monkeypatch.setattr("builtins.input", fake_input)
monkeypatch.setattr("hermes_cli.main._require_tty", lambda *_a, **_kw: None)
# Skip the bridge npm install — we're testing setup-ordering, not bridge
# bootstrapping. Pretend node_modules exists (Path.exists -> True for that
# specific check is hard to scope, so instead pretend npm install would
# succeed silently if reached).
monkeypatch.setattr(
"subprocess.run",
lambda *_a, **_kw: MagicMock(returncode=0, stderr=""),
)
monkeypatch.setattr("shutil.which", lambda _name: "/usr/bin/npm")
# Patch (bridge_dir / "node_modules").exists() by stubbing Path.exists
# to True for that one specific subpath. Easier: pre-create it as a
# symlink to /tmp. But we can't write to the repo. Instead, stub
# Path.exists wholesale to True for node_modules; the creds.json check
# in the same function still works because we wrote it ourselves.
_orig_exists = Path.exists
def _stub_exists(self):
if self.name == "node_modules":
return True
return _orig_exists(self)
monkeypatch.setattr(Path, "exists", _stub_exists)
buf = io.StringIO()
with redirect_stdout(buf):
cmd_whatsapp(MagicMock())
# The skip-rebar branch should have set the env var on its way out.
assert _env_value(isolated_home, "WHATSAPP_ENABLED") == "true"