fix(kanban): prevent infinite retry loop when worker exhausts iteration budget
recompute_ready() previously reset consecutive_failures to 0 when auto-recovering a blocked task. This defeated the circuit-breaker: a task that repeatedly exhausted its iteration budget would cycle forever (block → auto-recover with counter=0 → respawn → budget exhausted → block → …) with no signal to the operator. Fix: don't auto-recover tasks whose consecutive_failures has reached the effective failure limit (per-task max_retries or DEFAULT_FAILURE_LIMIT). The counter is also preserved across recovery so the breaker can accumulate across cycles. Fixes #35072
This commit is contained in:
@ -2599,17 +2599,24 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
|
||||
|
||||
``blocked`` tasks are also considered for promotion (so a task
|
||||
blocked purely by a parent dependency unblocks itself when the
|
||||
parent completes), *except* when the most recent block event was a
|
||||
worker-initiated ``kanban_block`` — those stay blocked until an
|
||||
explicit ``kanban_unblock`` (#28712). Without that guard, a
|
||||
``review-required`` handoff would auto-respawn, the fresh worker
|
||||
would find nothing to do, exit cleanly, get recorded as a protocol
|
||||
violation, and the cycle would repeat indefinitely.
|
||||
parent completes), *except* in two cases:
|
||||
|
||||
1. The most recent block event was a worker-initiated
|
||||
``kanban_block`` — those stay blocked until an explicit
|
||||
``kanban_unblock`` (#28712).
|
||||
|
||||
2. The task's ``consecutive_failures`` has reached the effective
|
||||
failure limit (per-task ``max_retries`` or
|
||||
``DEFAULT_FAILURE_LIMIT``). This prevents infinite retry
|
||||
loops when a task repeatedly exhausts its iteration budget:
|
||||
without this guard the counter would reset on every recovery
|
||||
cycle and the circuit breaker could never trip.
|
||||
"""
|
||||
promoted = 0
|
||||
with write_txn(conn):
|
||||
todo_rows = conn.execute(
|
||||
"SELECT id, status FROM tasks WHERE status IN ('todo', 'blocked')"
|
||||
"SELECT id, status, consecutive_failures, max_retries "
|
||||
"FROM tasks WHERE status IN ('todo', 'blocked')"
|
||||
).fetchall()
|
||||
for row in todo_rows:
|
||||
task_id = row["id"]
|
||||
@ -2627,13 +2634,25 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
|
||||
(task_id,),
|
||||
).fetchall()
|
||||
if all(p["status"] in ("done", "archived") for p in parents):
|
||||
# Blocked tasks also get their failure counters reset —
|
||||
# this is effectively an auto-unblock (circuit-breaker
|
||||
# recovery; worker-initiated blocks are skipped above).
|
||||
if cur_status == "blocked":
|
||||
# Don't auto-recover tasks that have hit the
|
||||
# circuit-breaker failure limit. Without this
|
||||
# guard, a task that repeatedly exhausts its
|
||||
# iteration budget would cycle forever:
|
||||
# block → auto-recover → respawn → budget
|
||||
# exhausted → block → … The counter must also
|
||||
# be preserved so the breaker can accumulate
|
||||
# across recovery cycles.
|
||||
failures = int(row["consecutive_failures"] or 0)
|
||||
task_limit = row["max_retries"]
|
||||
effective_limit = (
|
||||
int(task_limit) if task_limit is not None
|
||||
else DEFAULT_FAILURE_LIMIT
|
||||
)
|
||||
if failures >= effective_limit:
|
||||
continue
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status = 'ready', "
|
||||
"consecutive_failures = 0, last_failure_error = NULL "
|
||||
"UPDATE tasks SET status = 'ready' "
|
||||
"WHERE id = ? AND status = 'blocked'",
|
||||
(task_id,),
|
||||
)
|
||||
|
||||
@ -307,7 +307,8 @@ def test_recompute_ready_cascades_through_chain(kanban_home):
|
||||
|
||||
|
||||
def test_recompute_ready_promotes_blocked_with_done_parents(kanban_home):
|
||||
"""blocked tasks with all parents done should be promoted to ready."""
|
||||
"""blocked tasks with all parents done should be promoted to ready,
|
||||
unless the circuit-breaker failure limit has been reached."""
|
||||
with kb.connect() as conn:
|
||||
parent = kb.create_task(conn, title="parent", assignee="a")
|
||||
child = kb.create_task(
|
||||
@ -316,16 +317,16 @@ def test_recompute_ready_promotes_blocked_with_done_parents(kanban_home):
|
||||
# Complete the parent
|
||||
kb.claim_task(conn, parent)
|
||||
kb.complete_task(conn, parent, result="ok")
|
||||
# Manually block the child (simulates a worker that failed
|
||||
# after the parent finished)
|
||||
# Manually block the child with zero failures (simulates a
|
||||
# dependency block, not a circuit-breaker block).
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status='blocked', consecutive_failures=5, "
|
||||
"last_failure_error='persistent error' WHERE id=?",
|
||||
"UPDATE tasks SET status='blocked', consecutive_failures=0, "
|
||||
"last_failure_error=NULL WHERE id=?",
|
||||
(child,),
|
||||
)
|
||||
conn.commit()
|
||||
assert kb.get_task(conn, child).status == "blocked"
|
||||
# recompute_ready should promote blocked → ready and reset failures
|
||||
# recompute_ready should promote blocked → ready
|
||||
promoted = kb.recompute_ready(conn)
|
||||
assert promoted == 1
|
||||
task = kb.get_task(conn, child)
|
||||
@ -815,6 +816,82 @@ def test_unblock_resets_failure_counters(kanban_home):
|
||||
assert task.last_failure_error is None
|
||||
|
||||
|
||||
def test_recompute_ready_skips_tasks_at_failure_limit(kanban_home):
|
||||
"""recompute_ready must not auto-recover tasks whose consecutive_failures
|
||||
has reached the circuit-breaker limit (#35072).
|
||||
|
||||
Without this guard, a task that repeatedly exhausts its iteration
|
||||
budget would cycle forever: block → auto-recover (counter reset)
|
||||
→ respawn → budget exhausted → block → …
|
||||
"""
|
||||
with kb.connect() as conn:
|
||||
parent = kb.create_task(conn, title="parent", assignee="a")
|
||||
child = kb.create_task(conn, title="child", assignee="a",
|
||||
parents=[parent])
|
||||
# Complete the parent so the child's dependencies are satisfied.
|
||||
kb.claim_task(conn, parent)
|
||||
kb.complete_task(conn, parent, summary="done")
|
||||
|
||||
# Simulate the child having exhausted its budget twice,
|
||||
# hitting the default failure limit (2).
|
||||
kb.claim_task(conn, child)
|
||||
kb._record_task_failure(
|
||||
conn, child, error="budget exhausted 1",
|
||||
outcome="timed_out", release_claim=True, end_run=True,
|
||||
failure_limit=2,
|
||||
)
|
||||
kb._record_task_failure(
|
||||
conn, child, error="budget exhausted 2",
|
||||
outcome="timed_out", release_claim=True, end_run=True,
|
||||
failure_limit=2,
|
||||
)
|
||||
task = kb.get_task(conn, child)
|
||||
assert task.status == "blocked"
|
||||
assert task.consecutive_failures >= 2
|
||||
|
||||
# recompute_ready must NOT promote this task — the circuit
|
||||
# breaker has tripped and it should stay blocked.
|
||||
promoted = kb.recompute_ready(conn)
|
||||
assert promoted == 0
|
||||
assert kb.get_task(conn, child).status == "blocked"
|
||||
|
||||
# Explicit unblock should still work and reset the counter.
|
||||
assert kb.unblock_task(conn, child)
|
||||
task = kb.get_task(conn, child)
|
||||
assert task.status == "ready"
|
||||
assert task.consecutive_failures == 0
|
||||
|
||||
|
||||
def test_recompute_ready_recovers_below_limit(kanban_home):
|
||||
"""recompute_ready auto-recovers blocked tasks that haven't hit the
|
||||
failure limit yet — the counter is preserved across recovery."""
|
||||
with kb.connect() as conn:
|
||||
t = kb.create_task(conn, title="task", assignee="a")
|
||||
kb.claim_task(conn, t)
|
||||
# One failure, below the default limit of 2.
|
||||
kb._record_task_failure(
|
||||
conn, t, error="budget exhausted 1",
|
||||
outcome="timed_out", release_claim=True, end_run=True,
|
||||
failure_limit=2,
|
||||
)
|
||||
task = kb.get_task(conn, t)
|
||||
assert task.status == "ready"
|
||||
assert task.consecutive_failures == 1
|
||||
|
||||
# Simulate being blocked by something else (not circuit breaker).
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status = 'blocked' WHERE id = ?", (t,),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
promoted = kb.recompute_ready(conn)
|
||||
assert promoted == 1
|
||||
task = kb.get_task(conn, t)
|
||||
assert task.status == "ready"
|
||||
# Counter must be preserved, not reset.
|
||||
assert task.consecutive_failures == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parent-completion invariant at the claim gate (RCA t_a6acd07d)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user