fix(tools): isolate get_tool_definitions quiet_mode cache + dedup LCM injection (#17335)
Long-lived Gateway processes were sending duplicate tool names to providers that enforce uniqueness: - DeepSeek: 'Tool names must be unique.' - Xiaomi MiMo: 'tools contains duplicate names: lcm_expand' - Moonshot/Kimi: 'function name lcm_grep is duplicated' TUI was unaffected because TUI runs with quiet_mode=False and skips the cache entirely. Root cause (two layered bugs) - model_tools.get_tool_definitions(quiet_mode=True) memoizes its result in _tool_defs_cache. The cache-hit path returned list(cached) (safe), but the FIRST uncached call stored and returned the SAME object. run_agent.py mutates self.tools (memory + LCM context-engine schemas) in-place, so the very first agent init in a Gateway process poisoned the cache, and every subsequent init appended LCM schemas again on top of the already-polluted list. - run_agent.py's context-engine injection (lcm_grep / lcm_describe / lcm_expand) had no dedup, unlike the memory-tools injection right above it which already skips already-present names. Fix (defense in depth, per the issue's suggested fix) - model_tools.get_tool_definitions: on the uncached branch, cache the computed list but return list(result) to the caller. Same pattern as the cache-hit path. - run_agent.py: build _existing_tool_names from self.tools and skip schemas whose names are already present, mirroring the memory-tools block. This also defends against plugin paths that may register the same schemas via ctx.register_tool(). Tests (tests/test_get_tool_definitions_cache_isolation.py) - test_first_uncached_call_returns_fresh_list \u2014 pins the fix; without it, first-call alias caused all the symptoms. - test_cache_hit_returns_fresh_list \u2014 pre-existing behavior stays. - test_caller_mutation_does_not_poison_cache \u2014 simulates run_agent appending lcm_grep / lcm_expand to the returned list and asserts the next call doesn't see them. - test_repeated_caller_mutation_does_not_accumulate \u2014 reproduces the long-lived Gateway accumulation pattern across 5 agent inits. - test_non_quiet_mode_does_not_use_cache \u2014 sanity, explains why TUI was fine. 5/5 pass on the new file; 23/23 still pass on tests/test_model_tools.py.
This commit is contained in:
@ -320,7 +320,15 @@ def get_tool_definitions(
|
|||||||
|
|
||||||
result = _compute_tool_definitions(enabled_toolsets, disabled_toolsets, quiet_mode)
|
result = _compute_tool_definitions(enabled_toolsets, disabled_toolsets, quiet_mode)
|
||||||
if quiet_mode:
|
if quiet_mode:
|
||||||
|
# Cache the freshly-computed list, but hand callers a shallow copy so
|
||||||
|
# downstream mutations (e.g. run_agent appending memory/LCM tool
|
||||||
|
# schemas to self.tools) don't poison the cache. Without this, a
|
||||||
|
# long-lived Gateway process accumulates duplicate tool names across
|
||||||
|
# agent inits and providers that enforce unique tool names
|
||||||
|
# (DeepSeek, Xiaomi MiMo, Moonshot Kimi) reject the request with
|
||||||
|
# HTTP 400. Mirrors the cache-hit path above. (issue #17335)
|
||||||
_tool_defs_cache[cache_key] = result
|
_tool_defs_cache[cache_key] = result
|
||||||
|
return list(result)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
19
run_agent.py
19
run_agent.py
@ -2001,16 +2001,31 @@ class AIAgent:
|
|||||||
f"model.context_length in config.yaml to override."
|
f"model.context_length in config.yaml to override."
|
||||||
)
|
)
|
||||||
|
|
||||||
# Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand)
|
# Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand).
|
||||||
|
# Skip names that are already present — the get_tool_definitions()
|
||||||
|
# quiet_mode cache returned a shared list pre-#17335, so a stray
|
||||||
|
# mutation here would poison subsequent agent inits in the same
|
||||||
|
# Gateway process and trip provider-side 'duplicate tool name'
|
||||||
|
# errors. Even with the cache fix, dedup is the right defense
|
||||||
|
# against plugin paths that may register the same schemas via
|
||||||
|
# ctx.register_tool(). Mirrors the memory tools dedup above.
|
||||||
self._context_engine_tool_names: set = set()
|
self._context_engine_tool_names: set = set()
|
||||||
if hasattr(self, "context_compressor") and self.context_compressor and self.tools is not None:
|
if hasattr(self, "context_compressor") and self.context_compressor and self.tools is not None:
|
||||||
|
_existing_tool_names = {
|
||||||
|
t.get("function", {}).get("name")
|
||||||
|
for t in self.tools
|
||||||
|
if isinstance(t, dict)
|
||||||
|
}
|
||||||
for _schema in self.context_compressor.get_tool_schemas():
|
for _schema in self.context_compressor.get_tool_schemas():
|
||||||
|
_tname = _schema.get("name", "")
|
||||||
|
if _tname and _tname in _existing_tool_names:
|
||||||
|
continue # already registered via plugin/cache path
|
||||||
_wrapped = {"type": "function", "function": _schema}
|
_wrapped = {"type": "function", "function": _schema}
|
||||||
self.tools.append(_wrapped)
|
self.tools.append(_wrapped)
|
||||||
_tname = _schema.get("name", "")
|
|
||||||
if _tname:
|
if _tname:
|
||||||
self.valid_tool_names.add(_tname)
|
self.valid_tool_names.add(_tname)
|
||||||
self._context_engine_tool_names.add(_tname)
|
self._context_engine_tool_names.add(_tname)
|
||||||
|
_existing_tool_names.add(_tname)
|
||||||
|
|
||||||
# Notify context engine of session start
|
# Notify context engine of session start
|
||||||
if hasattr(self, "context_compressor") and self.context_compressor:
|
if hasattr(self, "context_compressor") and self.context_compressor:
|
||||||
|
|||||||
94
tests/test_get_tool_definitions_cache_isolation.py
Normal file
94
tests/test_get_tool_definitions_cache_isolation.py
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
"""Regression tests for issue #17335.
|
||||||
|
|
||||||
|
The ``quiet_mode=True`` fast path in :func:`model_tools.get_tool_definitions`
|
||||||
|
memoizes results to avoid re-walking the registry on every Gateway call. The
|
||||||
|
cached object must NOT be aliased into callers' return values \u2014 long-lived
|
||||||
|
Gateway processes mutate the returned list (``run_agent`` appends memory and
|
||||||
|
LCM context-engine tool schemas to ``self.tools``), and a shared list would
|
||||||
|
poison subsequent agent inits with duplicate tool names. Providers that
|
||||||
|
enforce uniqueness (DeepSeek, Xiaomi MiMo, Moonshot/Kimi) then reject the
|
||||||
|
API call with HTTP 400.
|
||||||
|
|
||||||
|
These tests pin:
|
||||||
|
- the cache-hit path returns a fresh list (existing #17098 behavior)
|
||||||
|
- the first uncached call also returns a fresh list (the fix)
|
||||||
|
- every call returns a list that is not the cached one, even after mutation
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import model_tools
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def _clear_cache():
|
||||||
|
"""Each test starts with an empty quiet_mode cache."""
|
||||||
|
model_tools._tool_defs_cache.clear()
|
||||||
|
yield
|
||||||
|
model_tools._tool_defs_cache.clear()
|
||||||
|
|
||||||
|
|
||||||
|
class TestQuietModeCacheIsolation:
|
||||||
|
|
||||||
|
def test_first_uncached_call_returns_fresh_list(self):
|
||||||
|
"""The first quiet_mode call must not alias the cached object \u2014
|
||||||
|
otherwise a caller mutating the returned list mutates the cache."""
|
||||||
|
first = model_tools.get_tool_definitions(quiet_mode=True)
|
||||||
|
assert isinstance(first, list)
|
||||||
|
# Find the cached value to compare identity.
|
||||||
|
assert len(model_tools._tool_defs_cache) == 1
|
||||||
|
cached = next(iter(model_tools._tool_defs_cache.values()))
|
||||||
|
assert first is not cached, (
|
||||||
|
"issue #17335: first quiet_mode call returned the cached list "
|
||||||
|
"by reference \u2014 mutations will leak into subsequent calls."
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_cache_hit_returns_fresh_list(self):
|
||||||
|
"""The cache-hit path already returned a copy pre-fix; pin it."""
|
||||||
|
first = model_tools.get_tool_definitions(quiet_mode=True)
|
||||||
|
second = model_tools.get_tool_definitions(quiet_mode=True)
|
||||||
|
assert first is not second
|
||||||
|
cached = next(iter(model_tools._tool_defs_cache.values()))
|
||||||
|
assert second is not cached
|
||||||
|
|
||||||
|
def test_caller_mutation_does_not_poison_cache(self):
|
||||||
|
"""Simulate run_agent appending LCM tool schemas to the returned
|
||||||
|
list. A second call must NOT see those appended entries."""
|
||||||
|
first = model_tools.get_tool_definitions(quiet_mode=True)
|
||||||
|
baseline_len = len(first)
|
||||||
|
# Caller mutates the returned list (this is what run_agent does
|
||||||
|
# when it injects memory + context-engine tool schemas).
|
||||||
|
first.append({"type": "function", "function": {"name": "lcm_grep"}})
|
||||||
|
first.append({"type": "function", "function": {"name": "lcm_expand"}})
|
||||||
|
|
||||||
|
second = model_tools.get_tool_definitions(quiet_mode=True)
|
||||||
|
# Length must match the original \u2014 cache pollution would make
|
||||||
|
# second 2 entries longer.
|
||||||
|
assert len(second) == baseline_len, (
|
||||||
|
f"issue #17335: cache was polluted by caller mutation. "
|
||||||
|
f"first len={baseline_len}, mutated len={len(first)}, "
|
||||||
|
f"second-call len={len(second)} \u2014 expected {baseline_len}."
|
||||||
|
)
|
||||||
|
names = [t.get("function", {}).get("name") for t in second]
|
||||||
|
assert "lcm_grep" not in names
|
||||||
|
assert "lcm_expand" not in names
|
||||||
|
|
||||||
|
def test_repeated_caller_mutation_does_not_accumulate(self):
|
||||||
|
"""The original Gateway symptom: every agent init in a long-lived
|
||||||
|
process appends LCM schemas, accumulating duplicates over time."""
|
||||||
|
baseline = len(model_tools.get_tool_definitions(quiet_mode=True))
|
||||||
|
for _ in range(5):
|
||||||
|
tools = model_tools.get_tool_definitions(quiet_mode=True)
|
||||||
|
tools.append({"type": "function", "function": {"name": "lcm_grep"}})
|
||||||
|
final = model_tools.get_tool_definitions(quiet_mode=True)
|
||||||
|
assert len(final) == baseline, (
|
||||||
|
f"Cache accumulated mutations across {5} agent inits: "
|
||||||
|
f"baseline={baseline}, final={len(final)}."
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_non_quiet_mode_does_not_use_cache(self):
|
||||||
|
"""Sanity: quiet_mode=False (TUI path) skips the cache entirely \u2014
|
||||||
|
explains why the bug only hit Gateway."""
|
||||||
|
model_tools.get_tool_definitions(quiet_mode=False)
|
||||||
|
assert len(model_tools._tool_defs_cache) == 0
|
||||||
Reference in New Issue
Block a user