fix(agents): fix lastMessage text extraction and tool_use/tool_result ordering

brandonkachen · brandonkachen · commit b72c9a8ddd67 · 2025-12-05T16:42:43.000-08:00
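- extractLastMessageText now concatenates the text parts of all assistant
  messages instead of returning only the first text part of the last assistant
  message that contained text. Due to streaming, each text chunk becomes a
  separate assistant message, so we need to join them all. Applied to the
  copies in editor-best-of-n and file-picker.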

- Fixed stream-parser so that text chunks are no longer added to
  assistantMessages once tool calls have started, which was causing tool_use
  blocks to be separated from their tool_result blocks (Anthropic API
  violation). These chunks are still passed to onResponseChunk and appended to
  fullResponseChunks.

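- Updated extractSpawnResults (it now returns the raw AgentOutput of each
  spawned agent) and added extractSelectorResult and extractLastMessageText
  helpers to handleStepsDefault and handleStepsOpus in editor-best-of-n for
  proper handling of lastMessage and structured outputs.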
diff --git a/.agents/editor/best-of-n/editor-best-of-n.ts b/.agents/editor/best-of-n/editor-best-of-n.ts
@@ -110,20 +110,19 @@ function* handleStepsDefault({
   } satisfies ToolCall<'spawn_agents'>
 
   // Extract spawn results
-  const spawnedImplementations =
-    extractSpawnResults<{ text: string }[]>(implementorResults)
+  const spawnedImplementations = extractSpawnResults(implementorResults)
 
   logger.info({ spawnedImplementations }, 'spawnedImplementations')
 
-  // Extract all the plans from the structured outputs
+  // Extract all the plans from the lastMessage outputs
   const letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
   // Parse implementations from spawn results
   const implementations = spawnedImplementations.map((result, index) => ({
     id: letters[index],
     content:
       'errorMessage' in result
         ? `Error: ${result.errorMessage}`
-        : result[0].text,
+        : extractLastMessageText(result) ?? '',
   }))
 
   // Spawn selector with implementations as params
@@ -140,10 +139,7 @@ function* handleStepsDefault({
     includeToolCall: false,
   } satisfies ToolCall<'spawn_agents'>
 
-  const selectorOutput = extractSpawnResults<{
-    implementationId: string
-    reasoning: string
-  }>(selectorResult)[0]
+  const selectorOutput = extractSelectorResult(selectorResult)
 
   if ('errorMessage' in selectorOutput) {
     yield {
@@ -195,24 +191,83 @@ function* handleStepsDefault({
     includeToolCall: false,
   } satisfies ToolCall<'set_output'>
 
-  function extractSpawnResults<T>(
+  /**
+   * Extracts the array of subagent results from spawn_agents tool output.
+   *
+   * The spawn_agents tool result structure is:
+   * [{ type: 'json', value: [{ agentName, agentType, value: AgentOutput }] }]
+   *
+   * Returns an array of agent outputs, one per spawned agent.
+   */
+  function extractSpawnResults(results: any[] | undefined): any[] {
+    if (!results || results.length === 0) return []
+
+    // Find the json result containing spawn results
+    const jsonResult = results.find((r) => r.type === 'json')
+    if (!jsonResult?.value) return []
+
+    // Get the spawned agent results array
+    const spawnedResults = Array.isArray(jsonResult.value)
+      ? jsonResult.value
+      : [jsonResult.value]
+
+    // Extract the value (AgentOutput) from each result
+    return spawnedResults.map((result: any) => result?.value).filter(Boolean)
+  }
+
+  /**
+   * Extracts the structured output from a selector agent's spawn result.
+   * Selector agents use outputMode: 'structured_output'.
+   */
+  function extractSelectorResult(
     results: any[] | undefined,
-  ): (T | { errorMessage: string })[] {
-    if (!results) return []
-    const spawnedResults = results
-      .filter((result) => result.type === 'json')
-      .map((result) => result.value)
-      .flat() as {
-      agentType: string
-      value: { value?: T; errorMessage?: string }
-    }[]
-    return spawnedResults.map(
-      (result) =>
-        result.value.value ?? {
-          errorMessage:
-            result.value.errorMessage ?? 'Error extracting spawn results',
-        },
-    )
+  ): { implementationId: string; reasoning: string } | { errorMessage: string } {
+    const outputs = extractSpawnResults(results)
+    const firstOutput = outputs[0]
+    if (!firstOutput) {
+      return { errorMessage: 'No selector output' }
+    }
+    if (firstOutput.type === 'structuredOutput' && firstOutput.value) {
+      return firstOutput.value
+    }
+    if (firstOutput.type === 'error') {
+      return { errorMessage: firstOutput.message ?? 'Selector error' }
+    }
+    return { errorMessage: 'Invalid selector output format' }
+  }
+
+  /**
+   * Extracts all text content from a 'lastMessage' AgentOutput.
+   *
+   * For agents with outputMode: 'last_message', the output structure is:
+   * { type: 'lastMessage', value: [{ role: 'assistant', content: [{ type: 'text', text: '...' }] }] }
+   *
+   * Returns concatenated text from all assistant messages, or null if not found.
+   * Note: Due to streaming, each text chunk may be a separate assistant message,
+   * so we need to concatenate all of them to get the full response.
+   */
+  function extractLastMessageText(agentOutput: any): string | null {
+    if (!agentOutput) return null
+
+    // Handle 'lastMessage' output mode - the value contains an array of messages
+    if (
+      agentOutput.type === 'lastMessage' &&
+      Array.isArray(agentOutput.value)
+    ) {
+      // Collect text from all assistant messages (streaming creates multiple messages)
+      const textParts: string[] = []
+      for (const message of agentOutput.value) {
+        if (message.role === 'assistant' && Array.isArray(message.content)) {
+          for (const part of message.content) {
+            if (part.type === 'text' && typeof part.text === 'string') {
+              textParts.push(part.text)
+            }
+          }
+        }
+      }
+      return textParts.length > 0 ? textParts.join('') : null
+    }
+    return null
   }
 
   // Extract only tool calls from text, removing any commentary
@@ -395,12 +450,14 @@ function* handleStepsMax({
   }
 
   /**
-   * Extracts the text content from a 'lastMessage' AgentOutput.
+   * Extracts all text content from a 'lastMessage' AgentOutput.
    *
    * For agents with outputMode: 'last_message', the output structure is:
    * { type: 'lastMessage', value: [{ role: 'assistant', content: [{ type: 'text', text: '...' }] }] }
    *
-   * Returns the text from the last assistant message, or null if not found.
+   * Returns concatenated text from all assistant messages, or null if not found.
+   * Note: Due to streaming, each text chunk may be a separate assistant message,
+   * so we need to concatenate all of them to get the full response.
    */
   function extractLastMessageText(agentOutput: any): string | null {
     if (!agentOutput) return null
@@ -410,18 +467,18 @@ function* handleStepsMax({
       agentOutput.type === 'lastMessage' &&
       Array.isArray(agentOutput.value)
     ) {
-      // Find the last assistant message with text content
-      for (let i = agentOutput.value.length - 1; i >= 0; i--) {
-        const message = agentOutput.value[i]
+      // Collect text from all assistant messages (streaming creates multiple messages)
+      const textParts: string[] = []
+      for (const message of agentOutput.value) {
         if (message.role === 'assistant' && Array.isArray(message.content)) {
-          // Find text content in the message
           for (const part of message.content) {
             if (part.type === 'text' && typeof part.text === 'string') {
-              return part.text
+              textParts.push(part.text)
             }
           }
         }
       }
+      return textParts.length > 0 ? textParts.join('') : null
     }
     return null
   }
@@ -457,18 +514,17 @@ function* handleStepsOpus({
   } satisfies ToolCall<'spawn_agents'>
 
   // Extract spawn results
-  const spawnedImplementations =
-    extractSpawnResults<{ text: string }[]>(implementorResults)
+  const spawnedImplementations = extractSpawnResults(implementorResults)
 
-  // Extract all the plans from the structured outputs
+  // Extract all the plans from the lastMessage outputs
   const letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
   // Parse implementations from spawn results
   const implementations = spawnedImplementations.map((result, index) => ({
     id: letters[index],
     content:
       'errorMessage' in result
         ? `Error: ${result.errorMessage}`
-        : result[0].text,
+        : extractLastMessageText(result) ?? '',
   }))
 
   // Spawn selector with implementations as params
@@ -485,10 +541,7 @@ function* handleStepsOpus({
     includeToolCall: false,
   } satisfies ToolCall<'spawn_agents'>
 
-  const selectorOutput = extractSpawnResults<{
-    implementationId: string
-    reasoning: string
-  }>(selectorResult)[0]
+  const selectorOutput = extractSelectorResult(selectorResult)
 
   if ('errorMessage' in selectorOutput) {
     yield {
@@ -540,24 +593,83 @@ function* handleStepsOpus({
     includeToolCall: false,
   } satisfies ToolCall<'set_output'>
 
-  function extractSpawnResults<T>(
+  /**
+   * Extracts the array of subagent results from spawn_agents tool output.
+   *
+   * The spawn_agents tool result structure is:
+   * [{ type: 'json', value: [{ agentName, agentType, value: AgentOutput }] }]
+   *
+   * Returns an array of agent outputs, one per spawned agent.
+   */
+  function extractSpawnResults(results: any[] | undefined): any[] {
+    if (!results || results.length === 0) return []
+
+    // Find the json result containing spawn results
+    const jsonResult = results.find((r) => r.type === 'json')
+    if (!jsonResult?.value) return []
+
+    // Get the spawned agent results array
+    const spawnedResults = Array.isArray(jsonResult.value)
+      ? jsonResult.value
+      : [jsonResult.value]
+
+    // Extract the value (AgentOutput) from each result
+    return spawnedResults.map((result: any) => result?.value).filter(Boolean)
+  }
+
+  /**
+   * Extracts the structured output from a selector agent's spawn result.
+   * Selector agents use outputMode: 'structured_output'.
+   */
+  function extractSelectorResult(
     results: any[] | undefined,
-  ): (T | { errorMessage: string })[] {
-    if (!results) return []
-    const spawnedResults = results
-      .filter((result) => result.type === 'json')
-      .map((result) => result.value)
-      .flat() as {
-      agentType: string
-      value: { value?: T; errorMessage?: string }
-    }[]
-    return spawnedResults.map(
-      (result) =>
-        result.value.value ?? {
-          errorMessage:
-            result.value.errorMessage ?? 'Error extracting spawn results',
-        },
-    )
+  ): { implementationId: string; reasoning: string } | { errorMessage: string } {
+    const outputs = extractSpawnResults(results)
+    const firstOutput = outputs[0]
+    if (!firstOutput) {
+      return { errorMessage: 'No selector output' }
+    }
+    if (firstOutput.type === 'structuredOutput' && firstOutput.value) {
+      return firstOutput.value
+    }
+    if (firstOutput.type === 'error') {
+      return { errorMessage: firstOutput.message ?? 'Selector error' }
+    }
+    return { errorMessage: 'Invalid selector output format' }
+  }
+
+  /**
+   * Extracts all text content from a 'lastMessage' AgentOutput.
+   *
+   * For agents with outputMode: 'last_message', the output structure is:
+   * { type: 'lastMessage', value: [{ role: 'assistant', content: [{ type: 'text', text: '...' }] }] }
+   *
+   * Returns concatenated text from all assistant messages, or null if not found.
+   * Note: Due to streaming, each text chunk may be a separate assistant message,
+   * so we need to concatenate all of them to get the full response.
+   */
+  function extractLastMessageText(agentOutput: any): string | null {
+    if (!agentOutput) return null
+
+    // Handle 'lastMessage' output mode - the value contains an array of messages
+    if (
+      agentOutput.type === 'lastMessage' &&
+      Array.isArray(agentOutput.value)
+    ) {
+      // Collect text from all assistant messages (streaming creates multiple messages)
+      const textParts: string[] = []
+      for (const message of agentOutput.value) {
+        if (message.role === 'assistant' && Array.isArray(message.content)) {
+          for (const part of message.content) {
+            if (part.type === 'text' && typeof part.text === 'string') {
+              textParts.push(part.text)
+            }
+          }
+        }
+      }
+      return textParts.length > 0 ? textParts.join('') : null
+    }
+    return null
   }
 
   // Extract only tool calls from text, removing any commentary
diff --git a/.agents/file-explorer/file-picker.ts b/.agents/file-explorer/file-picker.ts
@@ -113,32 +113,34 @@ Do not use any further tools or spawn any further agents.
     }
 
     /**
-     * Extracts the text content from a 'lastMessage' AgentOutput.
-     * 
+     * Extracts all text content from a 'lastMessage' AgentOutput.
+     *
      * For agents with outputMode: 'last_message', the output structure is:
      * { type: 'lastMessage', value: [{ role: 'assistant', content: [{ type: 'text', text: '...' }] }] }
-     * 
-     * Returns the text from the last assistant message, or null if not found.
+     *
+     * Returns concatenated text from all assistant messages, or null if not found.
+     * Note: Due to streaming, each text chunk may be a separate assistant message,
+     * so we need to concatenate all of them to get the full response.
      */
     function extractLastMessageText(agentOutput: any): string | null {
       if (!agentOutput) return null
-      
+
       // Handle 'lastMessage' output mode - the value contains an array of messages
       if (agentOutput.type === 'lastMessage' && Array.isArray(agentOutput.value)) {
-        // Find the last assistant message with text content
-        for (let i = agentOutput.value.length - 1; i >= 0; i--) {
-          const message = agentOutput.value[i]
+        // Collect text from all assistant messages (streaming creates multiple messages)
+        const textParts: string[] = []
+        for (const message of agentOutput.value) {
           if (message.role === 'assistant' && Array.isArray(message.content)) {
-            // Find text content in the message
             for (const part of message.content) {
               if (part.type === 'text' && typeof part.text === 'string') {
-                return part.text
+                textParts.push(part.text)
               }
             }
           }
         }
+        return textParts.length > 0 ? textParts.join('') : null
       }
-      
+
       return null
     }
 
diff --git a/packages/agent-runtime/src/tools/stream-parser.ts b/packages/agent-runtime/src/tools/stream-parser.ts
@@ -258,7 +258,11 @@ export async function processStream(
     } else if (chunk.type === 'text') {
       onResponseChunk(chunk.text)
       fullResponseChunks.push(chunk.text)
-      assistantMessages.push(assistantMessage(chunk.text))
+      // Only add text as assistant message if no tool calls have been made yet
+      // to avoid separating tool_use from tool_result (Anthropic API requirement)
+      if (toolCalls.length === 0) {
+        assistantMessages.push(assistantMessage(chunk.text))
+      }
     } else if (chunk.type === 'error') {
       onResponseChunk(chunk)