fix(tools): wrap _run_tool cleanup in finally to prevent interrupt state leak

When _invoke_tool raises a BaseException (CancelledError, KeyboardInterrupt),
the cleanup code at the end of _run_tool was bypassed because it sat outside
the except block (which only catches Exception).  ThreadPoolExecutor recycles
thread IDs, so the leaked tid in _interrupted_threads poisons the next tool
scheduled on that thread — it instantly aborts with 'Interrupted'.

Move the discard + _set_interrupt(False) into a finally block so cleanup
runs regardless of how the worker exits.

Fixes #35309
This commit is contained in:
liuhao1024
2026-05-30 19:45:18 +08:00
committed by Teknium
parent 2b16b756a7
commit bede3cf12d
2 changed files with 113 additions and 27 deletions

View File

@ -306,33 +306,38 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
# submit site below (GHSA-qg5c-hvr5-hjgr, #13617).
start = time.time()
try:
result = agent._invoke_tool(
function_name,
function_args,
effective_task_id,
tool_call.id,
messages=messages,
pre_tool_block_checked=True,
)
except Exception as tool_error:
result = f"Error executing tool '{function_name}': {tool_error}"
logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
duration = time.time() - start
is_error, _ = _detect_tool_failure(function_name, result)
if is_error:
logger.info("tool %s failed (%.2fs): %s", function_name, duration, result[:200])
else:
logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result))
results[index] = (function_name, function_args, result, duration, is_error, False)
# Tear down worker-tid tracking. Clear any interrupt bit we may
# have set so the next task scheduled onto this recycled tid
# starts with a clean slate.
with agent._tool_worker_threads_lock:
agent._tool_worker_threads.discard(_worker_tid)
try:
_ra()._set_interrupt(False, _worker_tid)
except Exception:
pass
try:
result = agent._invoke_tool(
function_name,
function_args,
effective_task_id,
tool_call.id,
messages=messages,
pre_tool_block_checked=True,
)
except Exception as tool_error:
result = f"Error executing tool '{function_name}': {tool_error}"
logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
duration = time.time() - start
is_error, _ = _detect_tool_failure(function_name, result)
if is_error:
logger.info("tool %s failed (%.2fs): %s", function_name, duration, result[:200])
else:
logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result))
results[index] = (function_name, function_args, result, duration, is_error, False)
finally:
# Tear down worker-tid tracking. Clear any interrupt bit we may
# have set so the next task scheduled onto this recycled tid
# starts with a clean slate. This MUST be in a finally block
# because BaseException subclasses (CancelledError, KeyboardInterrupt)
# bypass ``except Exception`` and would otherwise leak the tid
# into _interrupted_threads, poisoning the recycled thread.
with agent._tool_worker_threads_lock:
agent._tool_worker_threads.discard(_worker_tid)
try:
_ra()._set_interrupt(False, _worker_tid)
except Exception:
pass
# Start spinner for CLI mode (skip when TUI handles tool progress)
spinner = None