From 6ab71d3bb4cca36712b6895fd1bcc38fd3b9be4f Mon Sep 17 00:00:00 2001 From: liuhao1024 Date: Sat, 30 May 2026 09:34:17 +0800 Subject: [PATCH] fix(kanban): prevent infinite retry loop when worker exhausts iteration budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit recompute_ready() previously reset consecutive_failures to 0 when auto-recovering a blocked task. This defeated the circuit-breaker: a task that repeatedly exhausted its iteration budget would cycle forever (block → auto-recover with counter=0 → respawn → budget exhausted → block → …) with no signal to the operator. Fix: don't auto-recover tasks whose consecutive_failures has reached the effective failure limit (per-task max_retries or DEFAULT_FAILURE_LIMIT). The counter is also preserved across recovery so the breaker can accumulate across cycles. Fixes #35072 --- hermes_cli/kanban_db.py | 43 +++++++++++---- tests/hermes_cli/test_kanban_db.py | 89 ++++++++++++++++++++++++++++-- 2 files changed, 114 insertions(+), 18 deletions(-) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index c0e8372c7..95ba0d55c 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -2599,17 +2599,24 @@ def recompute_ready(conn: sqlite3.Connection) -> int: ``blocked`` tasks are also considered for promotion (so a task blocked purely by a parent dependency unblocks itself when the - parent completes), *except* when the most recent block event was a - worker-initiated ``kanban_block`` — those stay blocked until an - explicit ``kanban_unblock`` (#28712). Without that guard, a - ``review-required`` handoff would auto-respawn, the fresh worker - would find nothing to do, exit cleanly, get recorded as a protocol - violation, and the cycle would repeat indefinitely. + parent completes), *except* in two cases: + + 1. The most recent block event was a worker-initiated + ``kanban_block`` — those stay blocked until an explicit + ``kanban_unblock`` (#28712). + + 2. The task's ``consecutive_failures`` has reached the effective + failure limit (per-task ``max_retries`` or + ``DEFAULT_FAILURE_LIMIT``). This prevents infinite retry + loops when a task repeatedly exhausts its iteration budget: + without this guard the counter would reset on every recovery + cycle and the circuit breaker could never trip. """ promoted = 0 with write_txn(conn): todo_rows = conn.execute( - "SELECT id, status FROM tasks WHERE status IN ('todo', 'blocked')" + "SELECT id, status, consecutive_failures, max_retries " + "FROM tasks WHERE status IN ('todo', 'blocked')" ).fetchall() for row in todo_rows: task_id = row["id"] @@ -2627,13 +2634,25 @@ def recompute_ready(conn: sqlite3.Connection) -> int: (task_id,), ).fetchall() if all(p["status"] in ("done", "archived") for p in parents): - # Blocked tasks also get their failure counters reset — - # this is effectively an auto-unblock (circuit-breaker - # recovery; worker-initiated blocks are skipped above). if cur_status == "blocked": + # Don't auto-recover tasks that have hit the + # circuit-breaker failure limit. Without this + # guard, a task that repeatedly exhausts its + # iteration budget would cycle forever: + # block → auto-recover → respawn → budget + # exhausted → block → … The counter must also + # be preserved so the breaker can accumulate + # across recovery cycles. + failures = int(row["consecutive_failures"] or 0) + task_limit = row["max_retries"] + effective_limit = ( + int(task_limit) if task_limit is not None + else DEFAULT_FAILURE_LIMIT + ) + if failures >= effective_limit: + continue conn.execute( - "UPDATE tasks SET status = 'ready', " - "consecutive_failures = 0, last_failure_error = NULL " + "UPDATE tasks SET status = 'ready' " "WHERE id = ? AND status = 'blocked'", (task_id,), ) diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py index 020ad4fb4..8e3331891 100644 --- a/tests/hermes_cli/test_kanban_db.py +++ b/tests/hermes_cli/test_kanban_db.py @@ -307,7 +307,8 @@ def test_recompute_ready_cascades_through_chain(kanban_home): def test_recompute_ready_promotes_blocked_with_done_parents(kanban_home): - """blocked tasks with all parents done should be promoted to ready.""" + """blocked tasks with all parents done should be promoted to ready, + unless the circuit-breaker failure limit has been reached.""" with kb.connect() as conn: parent = kb.create_task(conn, title="parent", assignee="a") child = kb.create_task( @@ -316,16 +317,16 @@ def test_recompute_ready_promotes_blocked_with_done_parents(kanban_home): # Complete the parent kb.claim_task(conn, parent) kb.complete_task(conn, parent, result="ok") - # Manually block the child (simulates a worker that failed - # after the parent finished) + # Manually block the child with zero failures (simulates a + # dependency block, not a circuit-breaker block). conn.execute( - "UPDATE tasks SET status='blocked', consecutive_failures=5, " - "last_failure_error='persistent error' WHERE id=?", + "UPDATE tasks SET status='blocked', consecutive_failures=0, " + "last_failure_error=NULL WHERE id=?", (child,), ) conn.commit() assert kb.get_task(conn, child).status == "blocked" - # recompute_ready should promote blocked → ready and reset failures + # recompute_ready should promote blocked → ready promoted = kb.recompute_ready(conn) assert promoted == 1 task = kb.get_task(conn, child) @@ -815,6 +816,82 @@ def test_unblock_resets_failure_counters(kanban_home): assert task.last_failure_error is None +def test_recompute_ready_skips_tasks_at_failure_limit(kanban_home): + """recompute_ready must not auto-recover tasks whose consecutive_failures + has reached the circuit-breaker limit (#35072). + + Without this guard, a task that repeatedly exhausts its iteration + budget would cycle forever: block → auto-recover (counter reset) + → respawn → budget exhausted → block → … + """ + with kb.connect() as conn: + parent = kb.create_task(conn, title="parent", assignee="a") + child = kb.create_task(conn, title="child", assignee="a", + parents=[parent]) + # Complete the parent so the child's dependencies are satisfied. + kb.claim_task(conn, parent) + kb.complete_task(conn, parent, summary="done") + + # Simulate the child having exhausted its budget twice, + # hitting the default failure limit (2). + kb.claim_task(conn, child) + kb._record_task_failure( + conn, child, error="budget exhausted 1", + outcome="timed_out", release_claim=True, end_run=True, + failure_limit=2, + ) + kb._record_task_failure( + conn, child, error="budget exhausted 2", + outcome="timed_out", release_claim=True, end_run=True, + failure_limit=2, + ) + task = kb.get_task(conn, child) + assert task.status == "blocked" + assert task.consecutive_failures >= 2 + + # recompute_ready must NOT promote this task — the circuit + # breaker has tripped and it should stay blocked. + promoted = kb.recompute_ready(conn) + assert promoted == 0 + assert kb.get_task(conn, child).status == "blocked" + + # Explicit unblock should still work and reset the counter. + assert kb.unblock_task(conn, child) + task = kb.get_task(conn, child) + assert task.status == "ready" + assert task.consecutive_failures == 0 + + +def test_recompute_ready_recovers_below_limit(kanban_home): + """recompute_ready auto-recovers blocked tasks that haven't hit the + failure limit yet — the counter is preserved across recovery.""" + with kb.connect() as conn: + t = kb.create_task(conn, title="task", assignee="a") + kb.claim_task(conn, t) + # One failure, below the default limit of 2. + kb._record_task_failure( + conn, t, error="budget exhausted 1", + outcome="timed_out", release_claim=True, end_run=True, + failure_limit=2, + ) + task = kb.get_task(conn, t) + assert task.status == "ready" + assert task.consecutive_failures == 1 + + # Simulate being blocked by something else (not circuit breaker). + conn.execute( + "UPDATE tasks SET status = 'blocked' WHERE id = ?", (t,), + ) + conn.commit() + + promoted = kb.recompute_ready(conn) + assert promoted == 1 + task = kb.get_task(conn, t) + assert task.status == "ready" + # Counter must be preserved, not reset. + assert task.consecutive_failures == 1 + + # --------------------------------------------------------------------------- # Parent-completion invariant at the claim gate (RCA t_a6acd07d) # ---------------------------------------------------------------------------