Files
hermes-agent/tests/agent/test_unsupported_temperature_retry.py
Teknium 2062a84000 fix(auxiliary): stop capping output with max_tokens by default (#34530) (#34845)
* fix(auxiliary): stop capping output with max_tokens by default

Auxiliary LLM calls (compression, titles, vision, etc.) no longer send
max_tokens on the OpenAI-compatible chat-completions path. Most providers
treat an omitted max_tokens as "use the model max", which is what we want;
an explicit cap only risks truncation or a wire-format 400.

This was surfaced by GitHub Copilot / GPT-5 (#34530): those models reject
max_tokens and require max_completion_tokens, so compression 400'd and fell
back to a static context marker. Omitting the param sidesteps that quirk
(and ZAI vision's error 1210) entirely.

The Anthropic Messages wire (MiniMax + /anthropic endpoints) keeps
max_tokens because it is a mandatory field there.

* test(auxiliary): update temperature-retry assertions for omitted max_tokens

The temperature-retry tests asserted retry_kwargs["max_tokens"] == 500 on an
api.openai.com endpoint. Now that auxiliary calls omit max_tokens on
OpenAI-compatible endpoints (#34530), that key is absent. Assert it's absent
in both first and retry kwargs and use model as the survives-the-retry witness.
2026-05-29 17:24:30 -07:00

247 lines
11 KiB
Python

"""Regression tests for the universal "unsupported temperature" retry in
``agent.auxiliary_client``.

Auxiliary callers (context compression, session search,
web extract summarisation, etc.) hardcode ``temperature=0.3`` for historical
reasons. Several provider/model combinations reject ``temperature`` with a
400:

* OpenAI Responses (gpt-5/o-series reasoning models)
* Copilot Responses (reasoning models)
* OpenRouter reasoning models (gpt-5.5, some anthropic via OAI-compat)
* Anthropic Opus 4.7+ via OpenAI-compat endpoints
* Kimi/Moonshot (server-managed)

``_fixed_temperature_for_model`` catches Kimi up front, and
``build_chat_completion_kwargs`` drops temperature for Anthropic Opus 4.7+,
but the same backend can accept ``temperature`` for some models and reject
it for others (for example gpt-5.4 accepts but gpt-5.5 rejects on the same
endpoint). An allow/deny-list is not maintainable across providers.

The universal fix is reactive: when a call returns an
``Unsupported parameter: temperature`` 400, retry once without temperature.

These tests lock in that behaviour for both sync and async paths.
"""
from unittest.mock import patch, MagicMock, AsyncMock
import pytest
from agent.auxiliary_client import (
call_llm,
async_call_llm,
_is_unsupported_temperature_error,
)
class TestIsUnsupportedTemperatureError:
    """Check ``_is_unsupported_temperature_error`` against sample messages.

    Every sample is wrapped in a ``RuntimeError`` before being handed to the
    detector. Messages saying the ``temperature`` parameter itself is
    rejected must yield ``True``; any other failure text must yield ``False``.
    """

    # Wordings that have to be recognised, labelled by the kind of backend
    # each one is attributed to.
    _REJECTED_PARAM_MESSAGES = [
        # OpenAI / Codex Responses
        "HTTP 400: Unsupported parameter: temperature",
        "Error code: 400 - {'error': {'message': \"Unsupported parameter: 'temperature'\"}}",
        # Copilot / OpenAI error-code form
        "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}",
        # OpenRouter-style
        "Provider returned error: temperature is not supported for this model",
        "this model does not support temperature",
        # Anthropic-style via OAI-compat
        "temperature: unknown parameter",
        # Some gateways
        "unrecognized request argument supplied: temperature",
    ]

    # Failures that must never be mistaken for "temperature unsupported";
    # a false positive here would trigger a silent retry.
    _OTHER_FAILURE_MESSAGES = [
        # Unrelated errors
        "HTTP 400: Invalid value: 'tool'. Supported values are: 'assistant'...",
        "max_tokens is too large for this model",
        "Rate limit exceeded",
        "Connection reset by peer",
        # A bad temperature *value* is a different class of problem from
        # the parameter being unsupported.
        "temperature must be between 0 and 2",
    ]

    @pytest.mark.parametrize("message", _REJECTED_PARAM_MESSAGES)
    def test_matches_real_provider_messages(self, message):
        """Each known rejection wording is classified as unsupported."""
        error = RuntimeError(message)
        assert _is_unsupported_temperature_error(error) is True

    @pytest.mark.parametrize("message", _OTHER_FAILURE_MESSAGES)
    def test_does_not_match_unrelated_errors(self, message):
        """Errors about anything else are left alone by the detector."""
        error = RuntimeError(message)
        assert _is_unsupported_temperature_error(error) is False
def _dummy_response():
# The real code calls _validate_llm_response which inspects
# response.choices[0].message. The tests here patch that out, so
# any sentinel object is fine.
return {"ok": True}
class TestCallLlmUnsupportedTemperatureRetry:
    """Sync ``call_llm``: a single temperature-less retry, and only when due."""

    def _setup(self, first_exc):
        """Return a mock client that raises ``first_exc`` once, then succeeds."""
        mock_client = MagicMock()
        mock_client.base_url = "https://api.openai.com/v1"
        outcomes = [first_exc, _dummy_response()]
        mock_client.chat.completions.create.side_effect = outcomes
        return mock_client

    @pytest.mark.parametrize("error_message", [
        "HTTP 400: Unsupported parameter: temperature",
        "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}",
        "Provider error: this model does not support temperature",
    ])
    def test_retries_once_without_temperature(self, error_message):
        """A temperature rejection leads to exactly one retry without it."""
        client = self._setup(RuntimeError(error_message))
        create_mock = client.chat.completions.create

        resolve_patch = patch(
            "agent.auxiliary_client._resolve_task_provider_model",
            return_value=("openai-codex", "gpt-5.5", None, None, None),
        )
        client_patch = patch(
            "agent.auxiliary_client._get_cached_client",
            return_value=(client, "gpt-5.5"),
        )
        validate_patch = patch(
            "agent.auxiliary_client._validate_llm_response",
            side_effect=lambda resp, _task: resp,
        )
        with resolve_patch, client_patch, validate_patch:
            result = call_llm(
                task="compression",
                messages=[{"role": "user", "content": "remember this"}],
                temperature=0.3,
                max_tokens=500,
            )

        assert result == {"ok": True}
        assert create_mock.call_count == 2

        first_kwargs, retry_kwargs = (
            recorded.kwargs for recorded in create_mock.call_args_list
        )
        assert first_kwargs["temperature"] == 0.3
        assert "temperature" not in retry_kwargs
        # Auxiliary calls deliberately leave max_tokens out on
        # OpenAI-compatible endpoints (#34530) so the model can use its own
        # output limit; the key must therefore be missing from both attempts.
        # ``model`` is a kwarg that does get sent, so it serves as the proof
        # that the retry keeps everything except temperature.
        assert "max_tokens" not in first_kwargs
        assert "max_tokens" not in retry_kwargs
        assert retry_kwargs["model"] == first_kwargs["model"]

    def test_non_temperature_400_does_not_retry_as_temperature(self):
        """Unrelated 400s (e.g. bad tool role) must not silently drop temp."""
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        create_mock = client.chat.completions.create
        create_mock.side_effect = RuntimeError(
            "HTTP 400: Invalid value: 'tool'. Supported values are: 'assistant'..."
        )

        resolve_patch = patch(
            "agent.auxiliary_client._resolve_task_provider_model",
            return_value=("openai-codex", "gpt-5.5", None, None, None),
        )
        client_patch = patch(
            "agent.auxiliary_client._get_cached_client",
            return_value=(client, "gpt-5.5"),
        )
        validate_patch = patch(
            "agent.auxiliary_client._validate_llm_response",
            side_effect=lambda resp, _task: resp,
        )
        fallback_patch = patch(
            "agent.auxiliary_client._try_payment_fallback",
            return_value=None,
        )
        with resolve_patch, client_patch, validate_patch, fallback_patch:
            with pytest.raises(RuntimeError, match="Invalid value"):
                call_llm(
                    task="compression",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=0.3,
                    max_tokens=500,
                )

        # The error text is not a temperature rejection, so only the
        # original attempt may have been made.
        assert create_mock.call_count == 1

    def test_no_retry_when_temperature_not_in_kwargs(self):
        """If caller didn't send temperature, don't invent a temperature-retry."""
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        create_mock = client.chat.completions.create
        # The provider blames temperature although none was sent
        # (pathological, but misleading error text does occur). The
        # ``"temperature" in kwargs`` guard has to stop a pointless retry.
        create_mock.side_effect = RuntimeError(
            "HTTP 400: Unsupported parameter: temperature"
        )

        resolve_patch = patch(
            "agent.auxiliary_client._resolve_task_provider_model",
            return_value=("openai-codex", "gpt-5.5", None, None, None),
        )
        client_patch = patch(
            "agent.auxiliary_client._get_cached_client",
            return_value=(client, "gpt-5.5"),
        )
        validate_patch = patch(
            "agent.auxiliary_client._validate_llm_response",
            side_effect=lambda resp, _task: resp,
        )
        fallback_patch = patch(
            "agent.auxiliary_client._try_payment_fallback",
            return_value=None,
        )
        with resolve_patch, client_patch, validate_patch, fallback_patch:
            with pytest.raises(RuntimeError):
                call_llm(
                    task="compression",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=None,  # explicit: no temperature sent
                    max_tokens=500,
                )

        assert create_mock.call_count == 1
class TestAsyncCallLlmUnsupportedTemperatureRetry:
    """Async counterpart: ``async_call_llm`` follows the sync retry rules."""

    @pytest.mark.asyncio
    async def test_async_retries_once_without_temperature(self):
        """A temperature rejection is retried once, minus temperature."""
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        outcomes = [
            RuntimeError("HTTP 400: Unsupported parameter: temperature"),
            _dummy_response(),
        ]
        create_mock = AsyncMock(side_effect=outcomes)
        client.chat.completions.create = create_mock

        resolve_patch = patch(
            "agent.auxiliary_client._resolve_task_provider_model",
            return_value=("openai-codex", "gpt-5.5", None, None, None),
        )
        client_patch = patch(
            "agent.auxiliary_client._get_cached_client",
            return_value=(client, "gpt-5.5"),
        )
        validate_patch = patch(
            "agent.auxiliary_client._validate_llm_response",
            side_effect=lambda resp, _task: resp,
        )
        with resolve_patch, client_patch, validate_patch:
            result = await async_call_llm(
                task="session_search",
                messages=[{"role": "user", "content": "query"}],
                temperature=0.3,
                max_tokens=500,
            )

        assert result == {"ok": True}
        assert create_mock.await_count == 2

        first_kwargs, retry_kwargs = (
            recorded.kwargs for recorded in create_mock.call_args_list
        )
        assert first_kwargs["temperature"] == 0.3
        assert "temperature" not in retry_kwargs
        # On OpenAI-compatible endpoints max_tokens is left out on purpose
        # (#34530): check that it is missing from both attempts and that
        # ``model`` is carried over to the retry unchanged.
        assert "max_tokens" not in first_kwargs
        assert "max_tokens" not in retry_kwargs
        assert retry_kwargs["model"] == first_kwargs["model"]

    @pytest.mark.asyncio
    async def test_async_non_temperature_400_does_not_retry(self):
        """A 400 that is not about temperature is raised after one attempt."""
        client = MagicMock()
        client.base_url = "https://api.openai.com/v1"
        create_mock = AsyncMock(
            side_effect=RuntimeError("HTTP 400: Invalid value: 'tool'"),
        )
        client.chat.completions.create = create_mock

        resolve_patch = patch(
            "agent.auxiliary_client._resolve_task_provider_model",
            return_value=("openai-codex", "gpt-5.5", None, None, None),
        )
        client_patch = patch(
            "agent.auxiliary_client._get_cached_client",
            return_value=(client, "gpt-5.5"),
        )
        validate_patch = patch(
            "agent.auxiliary_client._validate_llm_response",
            side_effect=lambda resp, _task: resp,
        )
        fallback_patch = patch(
            "agent.auxiliary_client._try_payment_fallback",
            return_value=None,
        )
        with resolve_patch, client_patch, validate_patch, fallback_patch:
            with pytest.raises(RuntimeError, match="Invalid value"):
                await async_call_llm(
                    task="session_search",
                    messages=[{"role": "user", "content": "x"}],
                    temperature=0.3,
                    max_tokens=500,
                )

        assert create_mock.await_count == 1