fix(anthropic): demote dead thinking signature when orphan-strip mutates the latest turn

Extended-thinking Claude models (4.6+, e.g. Opus 4.8) emit a signed `thinking` block on assistant turns that also carry parallel `tool_use` blocks. Anthropic signs that block against the full, original turn content. When a parallel tool batch is interrupted before every `tool_result` returns, `_strip_orphaned_tool_blocks` removes the unanswered `tool_use` blocks on replay — which mutates the turn. The latest-assistant branch of `_manage_thinking_signatures` then replays the now-stale signed thinking block verbatim, and Anthropic rejects the request with a non-retryable HTTP 400:

    messages.N.content.M: `thinking` or `redacted_thinking` blocks in the latest assistant message cannot be modified. These blocks must remain as they were in the original response.

Because the poisoned turn is rebuilt from the persisted store on every turn, the gateway crash-loops with no self-recovery (a soft session reset does not clear it). The content index in the error drifts because the number of stripped `tool_use` blocks changes across rebuilds.

Fix: when orphan-stripping removes a `tool_use` from a turn that also holds a `thinking`/`redacted_thinking` block, flag the turn. `_manage_thinking_signatures` then demotes every thinking block on that latest turn to a plain text block (preserving the reasoning text; blocks with no `thinking` text are dropped) instead of replaying a signature that can no longer validate. An intact turn is unaffected — its signed thinking is still replayed verbatim. The internal flag is stripped before the payload is sent.

Adds two regression tests:
- demotion when an orphaned parallel `tool_use` is stripped
- control: signed thinking preserved verbatim when nothing is stripped
2026-05-31 08:22:17 -04:00
parent 2b5268f716
commit 64628ea89b
2 changed files with 110 additions and 1 deletion
--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@ -1783,11 +1783,25 @@ def _strip_orphaned_tool_blocks(result: List[Dict[str, Any]]) -> None:
                    tool_result_ids.add(block.get("tool_use_id"))
    for m in result:
        if m["role"] == "assistant" and isinstance(m["content"], list):
-            m["content"] = [
+            kept = [
                b
                for b in m["content"]
                if b.get("type") != "tool_use" or b.get("id") in tool_result_ids
            ]
+            # If stripping an orphaned tool_use mutated a turn that also carries a
+            # signed thinking block, that block's Anthropic signature was computed
+            # against the ORIGINAL (un-stripped) turn content and is now invalid.
+            # Anthropic rejects the replayed turn with HTTP 400 "thinking blocks in
+            # the latest assistant message cannot be modified".  Flag the turn so
+            # _manage_thinking_signatures can demote the dead signature instead of
+            # replaying it verbatim.  See hermes-agent: extended-thinking + parallel
+            # tool batch interrupted mid-flight → non-retryable 400 crash-loop.
+            if len(kept) != len(m["content"]) and any(
+                isinstance(b, dict) and b.get("type") in {"thinking", "redacted_thinking"}
+                for b in m["content"]
+            ):
+                m["_thinking_signature_invalidated"] = True
+            m["content"] = kept
            if not m["content"]:
                m["content"] = [{"type": "text", "text": "(tool call removed)"}]

@ -1832,6 +1846,10 @@ def _merge_consecutive_roles(result: List[Dict[str, Any]]) -> List[Dict[str, Any
                    fixed[-1]["content"] = prev_content + curr_content
            else:
                # Consecutive assistant messages — merge text content.
+                # Propagate the orphan-strip signature-invalidation flag onto the
+                # surviving (prev) dict so _manage_thinking_signatures still sees it.
+                if m.get("_thinking_signature_invalidated"):
+                    fixed[-1]["_thinking_signature_invalidated"] = True
                # Drop thinking blocks from the *second* message: their
                # signature was computed against a different turn boundary
                # and becomes invalid once merged.
@ -1920,11 +1938,26 @@ def _manage_thinking_signatures(
        else:
            # Latest assistant on direct Anthropic: keep signed, downgrade unsigned
            # to text so the reasoning isn't lost.
+            #
+            # Exception: if orphan-stripping (or another structural mutation) removed
+            # a tool_use block from THIS turn, every thinking signature on it was
+            # computed against the original turn content and is now dead.  Anthropic
+            # rejects the turn either way — replaying the signed block 400s with
+            # "thinking blocks in the latest assistant message cannot be modified",
+            # and a bare signed block with no following tool_use is also invalid.
+            # Demote ALL thinking blocks on this turn to text so the turn replays
+            # cleanly and the model can re-plan from the surviving tool results.
+            signature_dead = bool(m.get("_thinking_signature_invalidated"))
            new_content = []
            for b in m["content"]:
                if not isinstance(b, dict) or b.get("type") not in _THINKING_TYPES:
                    new_content.append(b)
                    continue
+                if signature_dead:
+                    thinking_text = b.get("thinking", "")
+                    if thinking_text:
+                        new_content.append({"type": "text", "text": thinking_text})
+                    continue
                if b.get("type") == "redacted_thinking":
                    # Redacted blocks use 'data' for the signature payload —
                    # drop the block when 'data' is missing (can't be validated).
@ -1944,6 +1977,9 @@ def _manage_thinking_signatures(
            if isinstance(b, dict) and b.get("type") in _THINKING_TYPES:
                b.pop("cache_control", None)

+        # Drop the internal bookkeeping flag — it must never reach the API payload.
+        m.pop("_thinking_signature_invalidated", None)
+

 def _evict_old_screenshots(result: List[Dict[str, Any]]) -> None:
    """Keep only the most recent ``_MAX_KEEP_IMAGES`` computer-use screenshots.