fix(anthropic): demote dead thinking signature when orphan-strip mutates the latest turn

Extended-thinking Claude models (4.6+, e.g. Opus 4.8) emit a signed `thinking`
block on assistant turns that also carry parallel `tool_use` blocks. Anthropic
signs that block against the full, original turn content.

When a parallel tool batch is interrupted before every `tool_result` returns,
`_strip_orphaned_tool_blocks` removes the unanswered `tool_use` on replay — which
mutates the turn. The latest-assistant branch of `_manage_thinking_signatures`
then replays the now-stale signed thinking block verbatim, and Anthropic rejects
the request with a non-retryable HTTP 400:

    messages.N.content.M: `thinking` or `redacted_thinking` blocks in the latest
    assistant message cannot be modified. These blocks must remain as they were
    in the original response.

Because the poisoned turn is rebuilt from the persisted store on every turn, the
gateway crash-loops with no self-recovery (a soft session reset does not clear
it). The content index `M` in the error drifts between attempts because the
number of stripped `tool_use` blocks changes across rebuilds.

Fix: when orphan-stripping removes a `tool_use` from a turn that also holds a
thinking/redacted_thinking block, flag the turn. `_manage_thinking_signatures`
then demotes every thinking block on that latest turn to a plain text block
(preserving the reasoning text) instead of replaying a signature that can no
longer validate; a block with no readable `thinking` text (for example a
`redacted_thinking` block) is dropped, since there is nothing to preserve. An
intact turn is unaffected — its signed thinking is still replayed verbatim.
The internal flag is stripped before the payload is sent.

Adds two regression tests:
- demotion when an orphaned parallel tool_use is stripped
- control: signed thinking preserved verbatim when nothing is stripped
This commit is contained in:
fesalfayed
2026-05-31 08:22:17 -04:00
committed by Teknium
parent 2b5268f716
commit 64628ea89b
2 changed files with 110 additions and 1 deletions

View File

@ -1783,11 +1783,25 @@ def _strip_orphaned_tool_blocks(result: List[Dict[str, Any]]) -> None:
tool_result_ids.add(block.get("tool_use_id"))
for m in result:
if m["role"] == "assistant" and isinstance(m["content"], list):
m["content"] = [
kept = [
b
for b in m["content"]
if b.get("type") != "tool_use" or b.get("id") in tool_result_ids
]
# If stripping an orphaned tool_use mutated a turn that also carries a
# signed thinking block, that block's Anthropic signature was computed
# against the ORIGINAL (un-stripped) turn content and is now invalid.
# Anthropic rejects the replayed turn with HTTP 400 "thinking blocks in
# the latest assistant message cannot be modified". Flag the turn so
# _manage_thinking_signatures can demote the dead signature instead of
# replaying it verbatim. See hermes-agent: extended-thinking + parallel
# tool batch interrupted mid-flight → non-retryable 400 crash-loop.
if len(kept) != len(m["content"]) and any(
isinstance(b, dict) and b.get("type") in {"thinking", "redacted_thinking"}
for b in m["content"]
):
m["_thinking_signature_invalidated"] = True
m["content"] = kept
if not m["content"]:
m["content"] = [{"type": "text", "text": "(tool call removed)"}]
@ -1832,6 +1846,10 @@ def _merge_consecutive_roles(result: List[Dict[str, Any]]) -> List[Dict[str, Any
fixed[-1]["content"] = prev_content + curr_content
else:
# Consecutive assistant messages — merge text content.
# Propagate the orphan-strip signature-invalidation flag onto the
# surviving (prev) dict so _manage_thinking_signatures still sees it.
if m.get("_thinking_signature_invalidated"):
fixed[-1]["_thinking_signature_invalidated"] = True
# Drop thinking blocks from the *second* message: their
# signature was computed against a different turn boundary
# and becomes invalid once merged.
@ -1920,11 +1938,26 @@ def _manage_thinking_signatures(
else:
# Latest assistant on direct Anthropic: keep signed, downgrade unsigned
# to text so the reasoning isn't lost.
#
# Exception: if orphan-stripping (or another structural mutation) removed
# a tool_use block from THIS turn, every thinking signature on it was
# computed against the original turn content and is now dead. Anthropic
# rejects the turn either way — replaying the signed block 400s with
# "thinking blocks in the latest assistant message cannot be modified",
# and a bare signed block with no following tool_use is also invalid.
# Demote every thinking block that carries reasoning text to a text block
# (blocks with no 'thinking' text, e.g. redacted_thinking, are dropped) so
# the turn replays cleanly and the model can re-plan from the surviving
# tool results.
signature_dead = bool(m.get("_thinking_signature_invalidated"))
new_content = []
for b in m["content"]:
if not isinstance(b, dict) or b.get("type") not in _THINKING_TYPES:
new_content.append(b)
continue
if signature_dead:
thinking_text = b.get("thinking", "")
if thinking_text:
new_content.append({"type": "text", "text": thinking_text})
continue
if b.get("type") == "redacted_thinking":
# Redacted blocks use 'data' for the signature payload —
# drop the block when 'data' is missing (can't be validated).
@ -1944,6 +1977,9 @@ def _manage_thinking_signatures(
if isinstance(b, dict) and b.get("type") in _THINKING_TYPES:
b.pop("cache_control", None)
# Drop the internal bookkeeping flag — it must never reach the API payload.
m.pop("_thinking_signature_invalidated", None)
def _evict_old_screenshots(result: List[Dict[str, Any]]) -> None:
"""Keep only the most recent ``_MAX_KEEP_IMAGES`` computer-use screenshots.

View File

@ -1827,6 +1827,79 @@ class TestThinkingBlockSignatureManagement:
assert len(last_thinking) == 1
assert last_thinking[0]["signature"] == "sig_3"
def test_orphan_stripped_tool_use_demotes_dead_signed_thinking(self):
    """Regression: a stripped orphan tool_use must demote signed thinking.

    Scenario: the assistant turn carries one signed thinking block and two
    parallel tool calls, but only one tool result came back. Converting the
    conversation strips the unanswered tool_use, which changes the turn the
    signature was computed against. Replaying that signature verbatim is
    rejected by Anthropic with a non-retryable HTTP 400 ("thinking blocks in
    the latest assistant message cannot be modified"), so the converted turn
    must carry the reasoning as a plain text block instead.
    """
    reasoning_text = "Plan: call A and B."
    interrupted_turn = {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {"id": "tc_kept", "function": {"name": "tool_a", "arguments": "{}"}},
            {"id": "tc_orphan", "function": {"name": "tool_b", "arguments": "{}"}},
        ],
        "reasoning_details": [
            {"type": "thinking", "thinking": reasoning_text, "signature": "sig_dead"},
        ],
    }
    # Only "tc_kept" is answered; "tc_orphan" never received a tool result.
    lone_result = {"role": "tool", "tool_call_id": "tc_kept", "content": "result A"}

    _, converted = convert_messages_to_anthropic([interrupted_turn, lone_result])
    assistant = next(msg for msg in converted if msg["role"] == "assistant")
    content = assistant["content"]

    # The dead signature must not be replayed in any thinking-type block.
    thinking_types = {"thinking", "redacted_thinking"}
    leftover_thinking = [
        blk
        for blk in content
        if isinstance(blk, dict) and blk.get("type") in thinking_types
    ]
    assert not leftover_thinking

    # The reasoning survives as ordinary text rather than being lost.
    texts = [blk.get("text", "") for blk in content if blk.get("type") == "text"]
    assert reasoning_text in texts

    # Exactly the answered tool_use remains; the orphan was stripped.
    remaining_tool_ids = [
        blk.get("id") for blk in content if blk.get("type") == "tool_use"
    ]
    assert remaining_tool_ids == ["tc_kept"]

    # The internal bookkeeping key is not present on the outgoing message.
    assert "_thinking_signature_invalidated" not in assistant
def test_signed_thinking_preserved_when_no_tool_use_stripped(self):
    """Control: a fully answered turn keeps its signed thinking block.

    Every tool call here has a matching tool result, so nothing is stripped
    and the signature remains valid. The thinking block must come through
    with its original signature, which guards against the orphan-strip
    demotion firing on intact turns.
    """
    signed_thinking = {
        "type": "thinking",
        "thinking": "Valid plan.",
        "signature": "sig_live",
    }
    conversation = [
        {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {"id": "tc_1", "function": {"name": "tool_a", "arguments": "{}"}},
            ],
            "reasoning_details": [signed_thinking],
        },
        {"role": "tool", "tool_call_id": "tc_1", "content": "result A"},
    ]

    _, converted = convert_messages_to_anthropic(conversation)
    assistant = next(msg for msg in converted if msg["role"] == "assistant")

    kept_thinking = [
        blk for blk in assistant["content"] if blk.get("type") == "thinking"
    ]
    assert len(kept_thinking) == 1
    assert kept_thinking[0]["signature"] == "sig_live"

    # The internal bookkeeping key is not present on the outgoing message.
    assert "_thinking_signature_invalidated" not in assistant
# ---------------------------------------------------------------------------
# Tool choice