fix(kimi): send max_tokens, reasoning_effort, and thinking for Kimi/Moonshot
Kimi/Moonshot endpoints require explicit parameters that Hermes was not
sending, causing 'Response truncated due to output length limit' errors
and inconsistent reasoning behavior.
Root cause analysis against Kimi CLI source (MoonshotAI/kimi-cli,
packages/kosong/src/kosong/chat_provider/kimi.py):
1. max_tokens: Kimi's API defaults to a very low value when omitted.
Reasoning tokens share the output budget — the model exhausts it on
thinking alone. Send 32000, matching Kimi CLI's generate() default.
2. reasoning_effort: Kimi CLI sends this as a top-level parameter (not
inside extra_body). Hermes was not sending it at all because
_supports_reasoning_extra_body() returns False for non-OpenRouter
endpoints.
3. extra_body.thinking: Kimi CLI uses with_thinking() which sets
extra_body.thinking={"type":"enabled"} alongside reasoning_effort.
This is a separate control from the OpenAI-style reasoning extra_body
that Hermes sends for OpenRouter/GitHub. Without it, the Kimi gateway
may not activate reasoning mode correctly.
Covers api.kimi.com (Kimi Code) and api.moonshot.ai/cn (Moonshot).
Tests: 6 new test cases for max_tokens, reasoning_effort, and
extra_body.thinking under various configs.
This commit is contained in:
46
run_agent.py
46
run_agent.py
@ -6909,6 +6909,34 @@ class AIAgent:
|
||||
# (the documented max output for qwen3-coder models) so the
|
||||
# model has adequate output budget for tool calls.
|
||||
api_kwargs.update(self._max_tokens_param(65536))
|
||||
elif (
|
||||
base_url_host_matches(self.base_url, "api.kimi.com")
|
||||
or base_url_host_matches(self.base_url, "moonshot.ai")
|
||||
or base_url_host_matches(self.base_url, "moonshot.cn")
|
||||
):
|
||||
# Kimi/Moonshot defaults to a low max_tokens when omitted.
|
||||
# Reasoning tokens share the output budget — without an explicit
|
||||
# value the model can exhaust it on thinking alone, causing
|
||||
# "Response truncated due to output length limit". 32000 matches
|
||||
# Kimi CLI's default (see MoonshotAI/kimi-cli kimi.py generate()).
|
||||
api_kwargs.update(self._max_tokens_param(32000))
|
||||
# Kimi requires reasoning_effort as a top-level chat completions
|
||||
# parameter (not inside extra_body). Mirror Kimi CLI's
|
||||
# with_generation_kwargs(reasoning_effort=...) / with_thinking():
|
||||
# when thinking is disabled, Kimi CLI omits reasoning_effort
|
||||
# entirely (maps to None).
|
||||
_kimi_thinking_off = bool(
|
||||
self.reasoning_config
|
||||
and isinstance(self.reasoning_config, dict)
|
||||
and self.reasoning_config.get("enabled") is False
|
||||
)
|
||||
if not _kimi_thinking_off:
|
||||
_kimi_effort = "medium"
|
||||
if self.reasoning_config and isinstance(self.reasoning_config, dict):
|
||||
_e = (self.reasoning_config.get("effort") or "").strip().lower()
|
||||
if _e in ("low", "medium", "high"):
|
||||
_kimi_effort = _e
|
||||
api_kwargs["reasoning_effort"] = _kimi_effort
|
||||
elif (self._is_openrouter_url() or "nousresearch" in self._base_url_lower) and "claude" in (self.model or "").lower():
|
||||
# OpenRouter and Nous Portal translate requests to Anthropic's
|
||||
# Messages API, which requires max_tokens as a mandatory field.
|
||||
@ -6940,6 +6968,24 @@ class AIAgent:
|
||||
extra_body["provider"] = provider_preferences
|
||||
_is_nous = "nousresearch" in self._base_url_lower
|
||||
|
||||
# Kimi/Moonshot API uses extra_body.thinking (separate from the
|
||||
# top-level reasoning_effort) to enable/disable reasoning mode.
|
||||
# Mirror Kimi CLI's with_thinking() behavior exactly — see
|
||||
# MoonshotAI/kimi-cli packages/kosong/src/kosong/chat_provider/kimi.py
|
||||
_is_kimi = (
|
||||
base_url_host_matches(self.base_url, "api.kimi.com")
|
||||
or base_url_host_matches(self.base_url, "moonshot.ai")
|
||||
or base_url_host_matches(self.base_url, "moonshot.cn")
|
||||
)
|
||||
if _is_kimi:
|
||||
_kimi_thinking_enabled = True
|
||||
if self.reasoning_config and isinstance(self.reasoning_config, dict):
|
||||
if self.reasoning_config.get("enabled") is False:
|
||||
_kimi_thinking_enabled = False
|
||||
extra_body["thinking"] = {
|
||||
"type": "enabled" if _kimi_thinking_enabled else "disabled",
|
||||
}
|
||||
|
||||
if self._supports_reasoning_extra_body():
|
||||
if _is_github_models:
|
||||
github_reasoning = self._github_models_reasoning_extra_body()
|
||||
|
||||
@ -952,6 +952,84 @@ class TestBuildApiKwargs:
|
||||
|
||||
assert "temperature" not in kwargs
|
||||
|
||||
def test_kimi_coding_endpoint_sends_max_tokens_and_reasoning(self, agent):
|
||||
"""Kimi endpoint should send max_tokens=32000 and reasoning_effort as
|
||||
top-level params, matching Kimi CLI's default behavior."""
|
||||
agent.base_url = "https://api.kimi.com/coding/v1"
|
||||
agent._base_url_lower = agent.base_url.lower()
|
||||
agent.model = "kimi-for-coding"
|
||||
messages = [{"role": "user", "content": "hi"}]
|
||||
|
||||
kwargs = agent._build_api_kwargs(messages)
|
||||
|
||||
assert kwargs["max_tokens"] == 32000
|
||||
assert kwargs["reasoning_effort"] == "medium"
|
||||
|
||||
def test_kimi_coding_endpoint_respects_custom_effort(self, agent):
|
||||
"""reasoning_effort should reflect reasoning_config.effort when set."""
|
||||
agent.base_url = "https://api.kimi.com/coding/v1"
|
||||
agent._base_url_lower = agent.base_url.lower()
|
||||
agent.model = "kimi-for-coding"
|
||||
agent.reasoning_config = {"enabled": True, "effort": "high"}
|
||||
messages = [{"role": "user", "content": "hi"}]
|
||||
|
||||
kwargs = agent._build_api_kwargs(messages)
|
||||
|
||||
assert kwargs["reasoning_effort"] == "high"
|
||||
|
||||
def test_kimi_coding_endpoint_sends_thinking_extra_body(self, agent):
|
||||
"""Kimi endpoint should send extra_body.thinking={"type":"enabled"}
|
||||
to activate reasoning mode, mirroring Kimi CLI's with_thinking()."""
|
||||
agent.base_url = "https://api.kimi.com/coding/v1"
|
||||
agent._base_url_lower = agent.base_url.lower()
|
||||
agent.model = "kimi-for-coding"
|
||||
messages = [{"role": "user", "content": "hi"}]
|
||||
|
||||
kwargs = agent._build_api_kwargs(messages)
|
||||
|
||||
assert kwargs["extra_body"]["thinking"] == {"type": "enabled"}
|
||||
|
||||
def test_kimi_coding_endpoint_disables_thinking(self, agent):
|
||||
"""When reasoning_config.enabled=False, thinking should be disabled
|
||||
and reasoning_effort should be omitted entirely — mirroring Kimi
|
||||
CLI's with_thinking("off") which maps to reasoning_effort=None."""
|
||||
agent.base_url = "https://api.kimi.com/coding/v1"
|
||||
agent._base_url_lower = agent.base_url.lower()
|
||||
agent.model = "kimi-for-coding"
|
||||
agent.reasoning_config = {"enabled": False}
|
||||
messages = [{"role": "user", "content": "hi"}]
|
||||
|
||||
kwargs = agent._build_api_kwargs(messages)
|
||||
|
||||
assert kwargs["extra_body"]["thinking"] == {"type": "disabled"}
|
||||
assert "reasoning_effort" not in kwargs
|
||||
|
||||
def test_moonshot_endpoint_sends_max_tokens_and_reasoning(self, agent):
|
||||
"""api.moonshot.ai should get the same Kimi-compatible params."""
|
||||
agent.base_url = "https://api.moonshot.ai/v1"
|
||||
agent._base_url_lower = agent.base_url.lower()
|
||||
agent.model = "kimi-k2.5"
|
||||
messages = [{"role": "user", "content": "hi"}]
|
||||
|
||||
kwargs = agent._build_api_kwargs(messages)
|
||||
|
||||
assert kwargs["max_tokens"] == 32000
|
||||
assert kwargs["reasoning_effort"] == "medium"
|
||||
assert kwargs["extra_body"]["thinking"] == {"type": "enabled"}
|
||||
|
||||
def test_moonshot_cn_endpoint_sends_max_tokens_and_reasoning(self, agent):
|
||||
"""api.moonshot.cn (China endpoint) should get the same params."""
|
||||
agent.base_url = "https://api.moonshot.cn/v1"
|
||||
agent._base_url_lower = agent.base_url.lower()
|
||||
agent.model = "kimi-k2.5"
|
||||
messages = [{"role": "user", "content": "hi"}]
|
||||
|
||||
kwargs = agent._build_api_kwargs(messages)
|
||||
|
||||
assert kwargs["max_tokens"] == 32000
|
||||
assert kwargs["reasoning_effort"] == "medium"
|
||||
assert kwargs["extra_body"]["thinking"] == {"type": "enabled"}
|
||||
|
||||
def test_provider_preferences_injected(self, agent):
|
||||
agent.base_url = "https://openrouter.ai/api/v1"
|
||||
agent.providers_allowed = ["Anthropic"]
|
||||
|
||||
Reference in New Issue
Block a user