fix(tools): wrap _run_tool cleanup in finally to prevent interrupt state leak
When _invoke_tool raises a BaseException (CancelledError, KeyboardInterrupt), the cleanup code at the end of _run_tool was bypassed because it sat outside the except block (which only catches Exception). ThreadPoolExecutor recycles thread IDs, so the leaked tid in _interrupted_threads poisons the next tool scheduled on that thread — it instantly aborts with 'Interrupted'. Move the discard + _set_interrupt(False) into a finally block so cleanup runs regardless of how the worker exits. Fixes #35309
This commit is contained in:
@ -306,33 +306,38 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
|
||||
# submit site below (GHSA-qg5c-hvr5-hjgr, #13617).
|
||||
start = time.time()
|
||||
try:
|
||||
result = agent._invoke_tool(
|
||||
function_name,
|
||||
function_args,
|
||||
effective_task_id,
|
||||
tool_call.id,
|
||||
messages=messages,
|
||||
pre_tool_block_checked=True,
|
||||
)
|
||||
except Exception as tool_error:
|
||||
result = f"Error executing tool '{function_name}': {tool_error}"
|
||||
logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
|
||||
duration = time.time() - start
|
||||
is_error, _ = _detect_tool_failure(function_name, result)
|
||||
if is_error:
|
||||
logger.info("tool %s failed (%.2fs): %s", function_name, duration, result[:200])
|
||||
else:
|
||||
logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result))
|
||||
results[index] = (function_name, function_args, result, duration, is_error, False)
|
||||
# Tear down worker-tid tracking. Clear any interrupt bit we may
|
||||
# have set so the next task scheduled onto this recycled tid
|
||||
# starts with a clean slate.
|
||||
with agent._tool_worker_threads_lock:
|
||||
agent._tool_worker_threads.discard(_worker_tid)
|
||||
try:
|
||||
_ra()._set_interrupt(False, _worker_tid)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
result = agent._invoke_tool(
|
||||
function_name,
|
||||
function_args,
|
||||
effective_task_id,
|
||||
tool_call.id,
|
||||
messages=messages,
|
||||
pre_tool_block_checked=True,
|
||||
)
|
||||
except Exception as tool_error:
|
||||
result = f"Error executing tool '{function_name}': {tool_error}"
|
||||
logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
|
||||
duration = time.time() - start
|
||||
is_error, _ = _detect_tool_failure(function_name, result)
|
||||
if is_error:
|
||||
logger.info("tool %s failed (%.2fs): %s", function_name, duration, result[:200])
|
||||
else:
|
||||
logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result))
|
||||
results[index] = (function_name, function_args, result, duration, is_error, False)
|
||||
finally:
|
||||
# Tear down worker-tid tracking. Clear any interrupt bit we may
|
||||
# have set so the next task scheduled onto this recycled tid
|
||||
# starts with a clean slate. This MUST be in a finally block
|
||||
# because BaseException subclasses (CancelledError, KeyboardInterrupt)
|
||||
# bypass ``except Exception`` and would otherwise leak the tid
|
||||
# into _interrupted_threads, poisoning the recycled thread.
|
||||
with agent._tool_worker_threads_lock:
|
||||
agent._tool_worker_threads.discard(_worker_tid)
|
||||
try:
|
||||
_ra()._set_interrupt(False, _worker_tid)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Start spinner for CLI mode (skip when TUI handles tool progress)
|
||||
spinner = None
|
||||
|
||||
Reference in New Issue
Block a user