* fix(auxiliary): stop capping output with max_tokens by default Auxiliary LLM calls (compression, titles, vision, etc.) no longer send max_tokens on the OpenAI-compatible chat-completions path. Most providers treat an omitted max_tokens as "use the model max", which is what we want; an explicit cap only risks truncation or a wire-format 400. This was surfaced by GitHub Copilot / GPT-5 (#34530): those models reject max_tokens and require max_completion_tokens, so compression 400'd and fell back to a static context marker. Omitting the param sidesteps that quirk (and ZAI vision's error 1210) entirely. The Anthropic Messages wire (MiniMax + /anthropic endpoints) keeps max_tokens because it is a mandatory field there. * test(auxiliary): update temperature-retry assertions for omitted max_tokens The temperature-retry tests asserted retry_kwargs["max_tokens"] == 500 on an api.openai.com endpoint. Now that auxiliary calls omit max_tokens on OpenAI-compatible endpoints (#34530), that key is absent. Assert it's absent in both first and retry kwargs and use model as the survives-the-retry witness.
This commit is contained in:
@ -4725,24 +4725,23 @@ def _build_call_kwargs(
|
||||
kwargs["temperature"] = temperature
|
||||
|
||||
if max_tokens is not None:
|
||||
# Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
|
||||
# Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
|
||||
# ZAI vision models (glm-4v-flash, glm-4v-plus, etc.) reject max_tokens with
|
||||
# error code 1210 ("API 调用参数有误") on multimodal requests — skip it.
|
||||
_model_lower = (model or "").lower()
|
||||
_skip_max_tokens = (
|
||||
provider == "zai"
|
||||
and ("4v" in _model_lower or "5v" in _model_lower or "-v" in _model_lower)
|
||||
# We do NOT cap output by default. Most chat-completions providers treat
|
||||
# an omitted max_tokens as "use the model's max output", which is what we
|
||||
# want for auxiliary tasks (compression summaries, titles, vision, etc.) —
|
||||
# an explicit cap only risks truncating a summary or 400-ing on providers
|
||||
# that reject the parameter outright (e.g. GitHub Copilot / newer OpenAI
|
||||
# GPT-5 models require max_completion_tokens, not max_tokens; ZAI vision
|
||||
# models reject it entirely with error 1210). Omitting it sidesteps all of
|
||||
# those wire-format quirks at once.
|
||||
#
|
||||
# The one exception is the Anthropic Messages wire (MiniMax and any
|
||||
# ``/anthropic`` endpoint reached through the OpenAI SDK wrapper), where
|
||||
# max_tokens is a MANDATORY field — omitting it is a hard 400. Keep it only
|
||||
# there.
|
||||
_effective_base = base_url or (
|
||||
_current_custom_base_url() if provider == "custom" else ""
|
||||
)
|
||||
if _skip_max_tokens:
|
||||
pass # ZAI vision models do not accept max_tokens
|
||||
elif provider == "custom":
|
||||
custom_base = base_url or _current_custom_base_url()
|
||||
if base_url_hostname(custom_base) == "api.openai.com":
|
||||
kwargs["max_completion_tokens"] = max_tokens
|
||||
else:
|
||||
kwargs["max_tokens"] = max_tokens
|
||||
else:
|
||||
if _is_anthropic_compat_endpoint(provider, _effective_base):
|
||||
kwargs["max_tokens"] = max_tokens
|
||||
|
||||
if tools:
|
||||
|
||||
@ -88,6 +88,62 @@ class TestAuxiliaryMaxTokensParam:
|
||||
assert auxiliary_max_tokens_param(2048) == {"max_completion_tokens": 2048}
|
||||
|
||||
|
||||
class TestBuildCallKwargsMaxTokens:
|
||||
"""_build_call_kwargs should not cap output by default (#34530).
|
||||
|
||||
Most chat-completions providers treat an omitted max_tokens as "use the
|
||||
model max", which is what we want for auxiliary tasks. An explicit cap only
|
||||
risks truncation or a wire-format 400 (GitHub Copilot / GPT-5 reject
|
||||
max_tokens; ZAI vision rejects it entirely). The Anthropic Messages wire is
|
||||
the one exception — max_tokens is a mandatory field there.
|
||||
"""
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"provider,model,base_url",
|
||||
[
|
||||
("copilot", "gpt-5.4", "https://api.githubcopilot.com"),
|
||||
("copilot", "gpt-5.5", "https://api.githubcopilot.com"),
|
||||
("custom", "gpt-5", "https://api.openai.com/v1"),
|
||||
("openrouter", "anthropic/claude-sonnet-4.6", "https://openrouter.ai/api/v1"),
|
||||
("nous", "hermes-4", "https://inference-api.nousresearch.com/v1"),
|
||||
("custom", "qwen", "http://localhost:8080/v1"),
|
||||
("zai", "glm-4v-flash", "https://open.bigmodel.cn/api/paas/v4"),
|
||||
],
|
||||
)
|
||||
def test_omits_max_tokens_for_openai_compatible(self, provider, model, base_url):
|
||||
from agent.auxiliary_client import _build_call_kwargs
|
||||
|
||||
kwargs = _build_call_kwargs(
|
||||
provider=provider,
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
max_tokens=1234,
|
||||
base_url=base_url,
|
||||
)
|
||||
assert "max_tokens" not in kwargs
|
||||
assert "max_completion_tokens" not in kwargs
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"provider,model,base_url",
|
||||
[
|
||||
("minimax", "minimax-m2", "https://api.minimax.io/v1"),
|
||||
("custom", "claude", "https://proxy.example.com/anthropic/v1"),
|
||||
],
|
||||
)
|
||||
def test_keeps_max_tokens_on_anthropic_wire(self, provider, model, base_url):
|
||||
from agent.auxiliary_client import _build_call_kwargs
|
||||
|
||||
kwargs = _build_call_kwargs(
|
||||
provider=provider,
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
max_tokens=1234,
|
||||
base_url=base_url,
|
||||
)
|
||||
assert kwargs["max_tokens"] == 1234
|
||||
assert "max_completion_tokens" not in kwargs
|
||||
|
||||
|
||||
class TestNormalizeAuxProvider:
|
||||
def test_maps_github_copilot_aliases(self):
|
||||
assert _normalize_aux_provider("github") == "copilot"
|
||||
|
||||
@ -112,8 +112,13 @@ class TestCallLlmUnsupportedTemperatureRetry:
|
||||
retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs
|
||||
assert first_kwargs["temperature"] == 0.3
|
||||
assert "temperature" not in retry_kwargs
|
||||
# other kwargs preserved
|
||||
assert retry_kwargs["max_tokens"] == 500
|
||||
# max_tokens is intentionally omitted on OpenAI-compatible endpoints
|
||||
# (#34530) — auxiliary calls let the model max out its own output — so
|
||||
# it must be absent in BOTH the first and retry kwargs. Use a kwarg that
|
||||
# actually survives (model) to prove the retry preserves the rest.
|
||||
assert "max_tokens" not in first_kwargs
|
||||
assert "max_tokens" not in retry_kwargs
|
||||
assert retry_kwargs["model"] == first_kwargs["model"]
|
||||
|
||||
def test_non_temperature_400_does_not_retry_as_temperature(self):
|
||||
"""Unrelated 400s (e.g. bad tool role) must not silently drop temp."""
|
||||
@ -207,7 +212,11 @@ class TestAsyncCallLlmUnsupportedTemperatureRetry:
|
||||
retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs
|
||||
assert first_kwargs["temperature"] == 0.3
|
||||
assert "temperature" not in retry_kwargs
|
||||
assert retry_kwargs["max_tokens"] == 500
|
||||
# max_tokens is intentionally omitted on OpenAI-compatible endpoints
|
||||
# (#34530); assert it's absent and that model survives the retry.
|
||||
assert "max_tokens" not in first_kwargs
|
||||
assert "max_tokens" not in retry_kwargs
|
||||
assert retry_kwargs["model"] == first_kwargs["model"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_non_temperature_400_does_not_retry(self):
|
||||
|
||||
Reference in New Issue
Block a user