diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index 5e49228bc..da76af8b1 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -4303,36 +4303,54 @@ def run_conversation( ) final_response = agent._handle_max_iterations(messages, api_call_count) - # If running as a kanban worker, block the task so the dispatcher - # knows the worker could not complete (rather than treating it as a + # If running as a kanban worker, signal the dispatcher that the + # worker could not complete (rather than treating it as a # protocol violation). The agent loop strips tools before calling # _handle_max_iterations, so the model cannot call kanban_block # itself — we must do it on its behalf. + # + # We route through ``_record_task_failure(outcome="timed_out")`` + # rather than ``kanban_block`` so this counts toward the + # ``consecutive_failures`` counter and the dispatcher's + # ``failure_limit`` circuit breaker (#29747 gap 2). Without this, + # a task whose worker keeps exhausting its budget would block + # silently each run, get auto-promoted by the operator (or never + # surface), and re-block in an endless loop with no signal. _kanban_task = os.environ.get("HERMES_KANBAN_TASK") if _kanban_task: try: - _ra().handle_function_call( - "kanban_block", - { - "task_id": _kanban_task, - "reason": ( + from hermes_cli import kanban_db as _kb + _conn = _kb.connect() + try: + _kb._record_task_failure( + _conn, + _kanban_task, + error=( f"Iteration budget exhausted " f"({api_call_count}/{agent.max_iterations}) — " "task could not complete within the allowed " "iterations" ), - }, - task_id=effective_task_id, - ) - logger.info( - "kanban_block called for task %s after iteration " - "exhaustion (%d/%d)", - _kanban_task, api_call_count, agent.max_iterations, - ) + outcome="timed_out", + release_claim=True, + end_run=True, + event_payload_extra={ + "budget_used": api_call_count, + "budget_max": agent.max_iterations, + }, + ) + logger.info( + "recorded budget-exhausted failure for task %s (%d/%d)", + _kanban_task, api_call_count, agent.max_iterations, + ) + finally: + try: + _conn.close() + except Exception: + pass except Exception: logger.warning( - "Failed to call kanban_block after iteration " - "exhaustion for task %s", + "Failed to record budget-exhausted failure for task %s", _kanban_task, exc_info=True, ) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index 7b5267cc9..832dcd0da 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -110,6 +110,16 @@ _IS_WINDOWS = sys.platform == "win32" # long single-call MCP workflows. DEFAULT_CLAIM_TTL_SECONDS = 15 * 60 +# If a worker's PID is still alive but its ``last_heartbeat_at`` is +# older than this when ``release_stale_claims`` runs, treat the worker +# as wedged and reclaim regardless of PID liveness (#29747 gap 3). +# This catches the logic-loop case where the process is technically +# running but not making observable progress. ``_touch_activity`` +# bridges chunk-level liveness into ``last_heartbeat_at`` via #31752, +# so any genuinely active worker keeps its heartbeat fresh as a side +# effect of normal API traffic. +DEFAULT_CLAIM_HEARTBEAT_MAX_STALE_SECONDS = 60 * 60 + def _resolve_claim_ttl_seconds(ttl_seconds: Optional[int] = None) -> int: """Return the effective claim TTL, honoring the kanban env override. @@ -2740,9 +2750,19 @@ def release_stale_claims( then-immediately-reclaim loop seen on slow models that spend longer than ``DEFAULT_CLAIM_TTL_SECONDS`` inside a single tool-free LLM call (#23025): no tool calls means no ``kanban_heartbeat``, even - though the subprocess is healthy. ``enforce_max_runtime`` and - ``detect_crashed_workers`` remain the upper bounds for genuinely - wedged or dead workers. + though the subprocess is healthy. + + Backstop (#29747 gap 3): if the worker's PID is still alive but its + ``last_heartbeat_at`` is stale by more than + ``DEFAULT_CLAIM_HEARTBEAT_MAX_STALE_SECONDS`` (1h), the worker has + been making no observable progress and we reclaim anyway — even if + ``_pid_alive`` is still true. This catches the wedged-in-a-logic-loop + case where the process is technically running but accomplishing + nothing. ``_touch_activity`` (run_agent.py) bridges chunk-level + liveness into ``last_heartbeat_at`` via #31752, so any genuinely + active worker keeps its heartbeat fresh as a side effect of normal + API traffic. ``enforce_max_runtime`` and ``detect_crashed_workers`` + remain the upper bounds for genuinely wedged or dead workers. Returns the number of stale claims actually reclaimed (live-pid extensions don't count). Safe to call often. @@ -2760,7 +2780,21 @@ def release_stale_claims( for row in stale: lock = row["claim_lock"] or "" host_local = lock.startswith(host_prefix) - if host_local and row["worker_pid"] and _pid_alive(row["worker_pid"]): + hb = row["last_heartbeat_at"] + # Heartbeat staleness backstop: if we have a heartbeat at all + # and it's older than the max-stale threshold, the worker is + # not making observable progress. Reclaim instead of extending, + # even if the PID is still alive (it's likely in a logic loop). + heartbeat_stale = ( + hb is not None + and (now - int(hb)) > DEFAULT_CLAIM_HEARTBEAT_MAX_STALE_SECONDS + ) + if ( + host_local + and row["worker_pid"] + and _pid_alive(row["worker_pid"]) + and not heartbeat_stale + ): new_expires = now + _resolve_claim_ttl_seconds() with write_txn(conn): cur = conn.execute( @@ -2829,6 +2863,7 @@ def release_stale_claims( ), "now": now, "host_local": host_local, + "heartbeat_stale": bool(heartbeat_stale), } payload.update(termination) _append_event( diff --git a/hermes_cli/kanban_diagnostics.py b/hermes_cli/kanban_diagnostics.py index bed5a6ebc..0e7fd052b 100644 --- a/hermes_cli/kanban_diagnostics.py +++ b/hermes_cli/kanban_diagnostics.py @@ -791,6 +791,83 @@ def _rule_stuck_in_blocked(task, events, runs, now, cfg) -> list[Diagnostic]: )] +def _rule_block_unblock_cycling(task, events, runs, now, cfg) -> list[Diagnostic]: + """Task has cycled through blocked → unblocked many times — the + ``unblock`` is not fixing the underlying problem and the worker + keeps re-blocking for substantially the same reason. + + ``_rule_stuck_in_blocked`` resets its timer on any ``commented`` / + ``unblocked`` event, so a task that cycles every few minutes is + invisible to it regardless of how many times it cycles (#29747 + gap 1). This rule complements that one by counting block→unblock + cycles in a sliding window. + + Threshold: cfg["block_cycle_threshold"] (default 3) cycles within + cfg["block_cycle_window_seconds"] (default 24h). + """ + threshold = _positive_int(cfg.get("block_cycle_threshold"), 3) + window_seconds = float(cfg.get("block_cycle_window_seconds", 24 * 3600)) + cycle_cutoff = now - window_seconds + + # Walk events chronologically (arrival order — callers pre-sort by + # id, which is the canonical chronological order; ``created_at`` + # alone is insufficient because multiple events can share the same + # second). Count "blocked after unblocked" transitions: every time + # a blocked event follows at least one unblocked event since the + # last cycle was counted, that's a new cycle. + cycles = 0 + seen_unblock_since_last_cycle = False + initial_blocked_ts = 0 + last_cycle_blocked_ts = 0 + for ev in events: + ts = _event_ts(ev) + if ts < cycle_cutoff: + continue + kind = _event_kind(ev) + if kind == "blocked": + if initial_blocked_ts == 0: + initial_blocked_ts = ts + if seen_unblock_since_last_cycle: + cycles += 1 + last_cycle_blocked_ts = ts + seen_unblock_since_last_cycle = False + elif kind == "unblocked": + seen_unblock_since_last_cycle = True + + if cycles < threshold: + return [] + + task_id = _task_field(task, "id") + actions: list[DiagnosticAction] = [] + if task_id: + actions.append(DiagnosticAction( + kind="cli_hint", + label=f"Check block reasons: hermes kanban events {task_id}", + payload={"command": f"hermes kanban events {task_id}"}, + suggested=True, + )) + return [Diagnostic( + kind="block_unblock_cycling", + severity="warning", + title=f"Task block→unblock cycled {cycles}x in {int(window_seconds/3600)}h", + detail=( + f"This task has been blocked {cycles} times after being " + "unblocked, suggesting the unblock is not addressing the " + "root cause and the worker keeps hitting the same wall. " + "Review the block reasons in the event history; a different " + "intervention (reassign, change scope, archive) may be needed." + ), + actions=actions, + first_seen_at=int(initial_blocked_ts) if initial_blocked_ts else int(now), + last_seen_at=int(last_cycle_blocked_ts) if last_cycle_blocked_ts else int(now), + count=cycles, + data={ + "cycles": cycles, + "window_seconds": int(window_seconds), + }, + )] + + def _rule_stranded_in_ready(task, events, runs, now, cfg) -> list[Diagnostic]: """Task has been in ``ready`` status for too long without any worker claiming it. @@ -923,6 +1000,7 @@ _RULES: list[RuleFn] = [ _rule_repeated_failures, _rule_repeated_crashes, _rule_stuck_in_blocked, + _rule_block_unblock_cycling, _rule_stranded_in_ready, ] @@ -936,6 +1014,7 @@ DIAGNOSTIC_KINDS = ( "repeated_failures", "repeated_crashes", "stuck_in_blocked", + "block_unblock_cycling", "stranded_in_ready", ) diff --git a/scripts/release.py b/scripts/release.py index 4632362ae..aa707626e 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -62,6 +62,7 @@ AUTHOR_MAP = { "211828103+julio-cloudvisor@users.noreply.github.com": "julio-cloudvisor", "17778+kweiner@users.noreply.github.com": "kweiner", "223516181+faisfamilytravel@users.noreply.github.com": "faisfamilytravel", + "45189813+baofuen@users.noreply.github.com": "baofuen", # teknium (multiple emails) "teknium1@gmail.com": "teknium1", "kenyon1977@gmail.com": "kenyonxu",