diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 44b11dfaa..c5fd9a20a 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -4725,24 +4725,23 @@ def _build_call_kwargs( kwargs["temperature"] = temperature if max_tokens is not None: - # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens. - # Direct OpenAI api.openai.com with newer models needs max_completion_tokens. - # ZAI vision models (glm-4v-flash, glm-4v-plus, etc.) reject max_tokens with - # error code 1210 ("API 调用参数有误") on multimodal requests — skip it. - _model_lower = (model or "").lower() - _skip_max_tokens = ( - provider == "zai" - and ("4v" in _model_lower or "5v" in _model_lower or "-v" in _model_lower) + # We do NOT cap output by default. Most chat-completions providers treat + # an omitted max_tokens as "use the model's max output", which is what we + # want for auxiliary tasks (compression summaries, titles, vision, etc.) — + # an explicit cap only risks truncating a summary or 400-ing on providers + # that reject the parameter outright (e.g. GitHub Copilot / newer OpenAI + # GPT-5 models require max_completion_tokens, not max_tokens; ZAI vision + # models reject it entirely with error 1210). Omitting it sidesteps all of + # those wire-format quirks at once. + # + # The one exception is the Anthropic Messages wire (MiniMax and any + # ``/anthropic`` endpoint reached through the OpenAI SDK wrapper), where + # max_tokens is a MANDATORY field — omitting it is a hard 400. Keep it only + # there. + _effective_base = base_url or ( + _current_custom_base_url() if provider == "custom" else "" ) - if _skip_max_tokens: - pass # ZAI vision models do not accept max_tokens - elif provider == "custom": - custom_base = base_url or _current_custom_base_url() - if base_url_hostname(custom_base) == "api.openai.com": - kwargs["max_completion_tokens"] = max_tokens - else: - kwargs["max_tokens"] = max_tokens - else: + if _is_anthropic_compat_endpoint(provider, _effective_base): kwargs["max_tokens"] = max_tokens if tools: diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index 66e52b6c1..97c3a7f6b 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -88,6 +88,62 @@ class TestAuxiliaryMaxTokensParam: assert auxiliary_max_tokens_param(2048) == {"max_completion_tokens": 2048} +class TestBuildCallKwargsMaxTokens: + """_build_call_kwargs should not cap output by default (#34530). + + Most chat-completions providers treat an omitted max_tokens as "use the + model max", which is what we want for auxiliary tasks. An explicit cap only + risks truncation or a wire-format 400 (GitHub Copilot / GPT-5 reject + max_tokens; ZAI vision rejects it entirely). The Anthropic Messages wire is + the one exception — max_tokens is a mandatory field there. + """ + + @pytest.mark.parametrize( + "provider,model,base_url", + [ + ("copilot", "gpt-5.4", "https://api.githubcopilot.com"), + ("copilot", "gpt-5.5", "https://api.githubcopilot.com"), + ("custom", "gpt-5", "https://api.openai.com/v1"), + ("openrouter", "anthropic/claude-sonnet-4.6", "https://openrouter.ai/api/v1"), + ("nous", "hermes-4", "https://inference-api.nousresearch.com/v1"), + ("custom", "qwen", "http://localhost:8080/v1"), + ("zai", "glm-4v-flash", "https://open.bigmodel.cn/api/paas/v4"), + ], + ) + def test_omits_max_tokens_for_openai_compatible(self, provider, model, base_url): + from agent.auxiliary_client import _build_call_kwargs + + kwargs = _build_call_kwargs( + provider=provider, + model=model, + messages=[{"role": "user", "content": "hi"}], + max_tokens=1234, + base_url=base_url, + ) + assert "max_tokens" not in kwargs + assert "max_completion_tokens" not in kwargs + + @pytest.mark.parametrize( + "provider,model,base_url", + [ + ("minimax", "minimax-m2", "https://api.minimax.io/v1"), + ("custom", "claude", "https://proxy.example.com/anthropic/v1"), + ], + ) + def test_keeps_max_tokens_on_anthropic_wire(self, provider, model, base_url): + from agent.auxiliary_client import _build_call_kwargs + + kwargs = _build_call_kwargs( + provider=provider, + model=model, + messages=[{"role": "user", "content": "hi"}], + max_tokens=1234, + base_url=base_url, + ) + assert kwargs["max_tokens"] == 1234 + assert "max_completion_tokens" not in kwargs + + class TestNormalizeAuxProvider: def test_maps_github_copilot_aliases(self): assert _normalize_aux_provider("github") == "copilot" diff --git a/tests/agent/test_unsupported_temperature_retry.py b/tests/agent/test_unsupported_temperature_retry.py index 82d8d3208..4d2ebb980 100644 --- a/tests/agent/test_unsupported_temperature_retry.py +++ b/tests/agent/test_unsupported_temperature_retry.py @@ -112,8 +112,13 @@ class TestCallLlmUnsupportedTemperatureRetry: retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs assert first_kwargs["temperature"] == 0.3 assert "temperature" not in retry_kwargs - # other kwargs preserved - assert retry_kwargs["max_tokens"] == 500 + # max_tokens is intentionally omitted on OpenAI-compatible endpoints + # (#34530) — auxiliary calls let the model max out its own output — so + # it must be absent in BOTH the first and retry kwargs. Use a kwarg that + # actually survives (model) to prove the retry preserves the rest. + assert "max_tokens" not in first_kwargs + assert "max_tokens" not in retry_kwargs + assert retry_kwargs["model"] == first_kwargs["model"] def test_non_temperature_400_does_not_retry_as_temperature(self): """Unrelated 400s (e.g. bad tool role) must not silently drop temp.""" @@ -207,7 +212,11 @@ class TestAsyncCallLlmUnsupportedTemperatureRetry: retry_kwargs = client.chat.completions.create.call_args_list[1].kwargs assert first_kwargs["temperature"] == 0.3 assert "temperature" not in retry_kwargs - assert retry_kwargs["max_tokens"] == 500 + # max_tokens is intentionally omitted on OpenAI-compatible endpoints + # (#34530); assert it's absent and that model survives the retry. + assert "max_tokens" not in first_kwargs + assert "max_tokens" not in retry_kwargs + assert retry_kwargs["model"] == first_kwargs["model"] @pytest.mark.asyncio async def test_async_non_temperature_400_does_not_retry(self):