fix(kanban): prevent infinite retry loop when worker exhausts iteration budget

recompute_ready() previously reset consecutive_failures to 0 when
auto-recovering a blocked task.  This defeated the circuit-breaker:
a task that repeatedly exhausted its iteration budget would cycle
forever (block → auto-recover with counter=0 → respawn → budget
exhausted → block → …) with no signal to the operator.

Fix: don't auto-recover tasks whose consecutive_failures has reached
the effective failure limit (per-task max_retries or
DEFAULT_FAILURE_LIMIT).  The counter is also preserved across
recovery so the breaker can accumulate across cycles.

Fixes #35072
This commit is contained in:
liuhao1024
2026-05-30 09:34:17 +08:00
committed by Teknium
parent c70dca3a88
commit 6ab71d3bb4
2 changed files with 114 additions and 18 deletions

View File

@ -2599,17 +2599,24 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
``blocked`` tasks are also considered for promotion (so a task
blocked purely by a parent dependency unblocks itself when the
parent completes), *except* when the most recent block event was a
worker-initiated ``kanban_block`` — those stay blocked until an
explicit ``kanban_unblock`` (#28712). Without that guard, a
``review-required`` handoff would auto-respawn, the fresh worker
would find nothing to do, exit cleanly, get recorded as a protocol
violation, and the cycle would repeat indefinitely.
parent completes), *except* in two cases:
1. The most recent block event was a worker-initiated
``kanban_block`` — those stay blocked until an explicit
``kanban_unblock`` (#28712).
2. The task's ``consecutive_failures`` has reached the effective
failure limit (per-task ``max_retries`` or
``DEFAULT_FAILURE_LIMIT``). This prevents infinite retry
loops when a task repeatedly exhausts its iteration budget:
without this guard the counter would reset on every recovery
cycle and the circuit breaker could never trip.
"""
promoted = 0
with write_txn(conn):
todo_rows = conn.execute(
"SELECT id, status FROM tasks WHERE status IN ('todo', 'blocked')"
"SELECT id, status, consecutive_failures, max_retries "
"FROM tasks WHERE status IN ('todo', 'blocked')"
).fetchall()
for row in todo_rows:
task_id = row["id"]
@ -2627,13 +2634,25 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
(task_id,),
).fetchall()
if all(p["status"] in ("done", "archived") for p in parents):
# Blocked tasks also get their failure counters reset —
# this is effectively an auto-unblock (circuit-breaker
# recovery; worker-initiated blocks are skipped above).
if cur_status == "blocked":
# Don't auto-recover tasks that have hit the
# circuit-breaker failure limit. Without this
# guard, a task that repeatedly exhausts its
# iteration budget would cycle forever:
# block → auto-recover → respawn → budget
# exhausted → block → … The counter must also
# be preserved so the breaker can accumulate
# across recovery cycles.
failures = int(row["consecutive_failures"] or 0)
task_limit = row["max_retries"]
effective_limit = (
int(task_limit) if task_limit is not None
else DEFAULT_FAILURE_LIMIT
)
if failures >= effective_limit:
continue
conn.execute(
"UPDATE tasks SET status = 'ready', "
"consecutive_failures = 0, last_failure_error = NULL "
"UPDATE tasks SET status = 'ready' "
"WHERE id = ? AND status = 'blocked'",
(task_id,),
)

View File

@ -307,7 +307,8 @@ def test_recompute_ready_cascades_through_chain(kanban_home):
def test_recompute_ready_promotes_blocked_with_done_parents(kanban_home):
"""blocked tasks with all parents done should be promoted to ready."""
"""blocked tasks with all parents done should be promoted to ready,
unless the circuit-breaker failure limit has been reached."""
with kb.connect() as conn:
parent = kb.create_task(conn, title="parent", assignee="a")
child = kb.create_task(
@ -316,16 +317,16 @@ def test_recompute_ready_promotes_blocked_with_done_parents(kanban_home):
# Complete the parent
kb.claim_task(conn, parent)
kb.complete_task(conn, parent, result="ok")
# Manually block the child (simulates a worker that failed
# after the parent finished)
# Manually block the child with zero failures (simulates a
# dependency block, not a circuit-breaker block).
conn.execute(
"UPDATE tasks SET status='blocked', consecutive_failures=5, "
"last_failure_error='persistent error' WHERE id=?",
"UPDATE tasks SET status='blocked', consecutive_failures=0, "
"last_failure_error=NULL WHERE id=?",
(child,),
)
conn.commit()
assert kb.get_task(conn, child).status == "blocked"
# recompute_ready should promote blocked → ready and reset failures
# recompute_ready should promote blocked → ready
promoted = kb.recompute_ready(conn)
assert promoted == 1
task = kb.get_task(conn, child)
@ -815,6 +816,82 @@ def test_unblock_resets_failure_counters(kanban_home):
assert task.last_failure_error is None
def test_recompute_ready_skips_tasks_at_failure_limit(kanban_home):
"""recompute_ready must not auto-recover tasks whose consecutive_failures
has reached the circuit-breaker limit (#35072).
Without this guard, a task that repeatedly exhausts its iteration
budget would cycle forever: block → auto-recover (counter reset)
→ respawn → budget exhausted → block → …
"""
with kb.connect() as conn:
parent = kb.create_task(conn, title="parent", assignee="a")
child = kb.create_task(conn, title="child", assignee="a",
parents=[parent])
# Complete the parent so the child's dependencies are satisfied.
kb.claim_task(conn, parent)
kb.complete_task(conn, parent, summary="done")
# Simulate the child having exhausted its budget twice,
# hitting the default failure limit (2).
kb.claim_task(conn, child)
kb._record_task_failure(
conn, child, error="budget exhausted 1",
outcome="timed_out", release_claim=True, end_run=True,
failure_limit=2,
)
kb._record_task_failure(
conn, child, error="budget exhausted 2",
outcome="timed_out", release_claim=True, end_run=True,
failure_limit=2,
)
task = kb.get_task(conn, child)
assert task.status == "blocked"
assert task.consecutive_failures >= 2
# recompute_ready must NOT promote this task — the circuit
# breaker has tripped and it should stay blocked.
promoted = kb.recompute_ready(conn)
assert promoted == 0
assert kb.get_task(conn, child).status == "blocked"
# Explicit unblock should still work and reset the counter.
assert kb.unblock_task(conn, child)
task = kb.get_task(conn, child)
assert task.status == "ready"
assert task.consecutive_failures == 0
def test_recompute_ready_recovers_below_limit(kanban_home):
"""recompute_ready auto-recovers blocked tasks that haven't hit the
failure limit yet — the counter is preserved across recovery."""
with kb.connect() as conn:
t = kb.create_task(conn, title="task", assignee="a")
kb.claim_task(conn, t)
# One failure, below the default limit of 2.
kb._record_task_failure(
conn, t, error="budget exhausted 1",
outcome="timed_out", release_claim=True, end_run=True,
failure_limit=2,
)
task = kb.get_task(conn, t)
assert task.status == "ready"
assert task.consecutive_failures == 1
# Simulate being blocked by something else (not circuit breaker).
conn.execute(
"UPDATE tasks SET status = 'blocked' WHERE id = ?", (t,),
)
conn.commit()
promoted = kb.recompute_ready(conn)
assert promoted == 1
task = kb.get_task(conn, t)
assert task.status == "ready"
# Counter must be preserved, not reset.
assert task.consecutive_failures == 1
# ---------------------------------------------------------------------------
# Parent-completion invariant at the claim gate (RCA t_a6acd07d)
# ---------------------------------------------------------------------------