109 changes: 107 additions & 2 deletions src/agents/handoffs/history.py
@@ -21,6 +21,10 @@
"set_conversation_history_wrappers",
]

# Content types that represent multimodal data (images, files, audio) which should be
# preserved during handoffs rather than being converted to text summaries.
_MULTIMODAL_CONTENT_TYPES = frozenset({"input_image", "input_file", "input_audio"})

_DEFAULT_CONVERSATION_HISTORY_START = "<CONVERSATION HISTORY>"
_DEFAULT_CONVERSATION_HISTORY_END = "</CONVERSATION HISTORY>"
_conversation_history_start = _DEFAULT_CONVERSATION_HISTORY_START
@@ -90,10 +94,28 @@ def nest_handoff_history(
def default_handoff_history_mapper(
transcript: list[TResponseInputItem],
) -> list[TResponseInputItem]:
"""Return a single assistant message summarizing the transcript."""
"""Return a summary of the transcript, preserving multimodal content.

The returned list contains:
1. An assistant message summarizing the text conversation
2. A user message with any multimodal content (images, files, audio) if present

This ensures that multimodal content uploaded by users is preserved during handoffs.
"""
multimodal_content = _extract_multimodal_content(transcript)
summary_message = _build_summary_message(transcript)
return [summary_message]

result: list[TResponseInputItem] = [summary_message]

# If there's multimodal content, add it as a user message so the next agent can see it.
if multimodal_content:
user_message: dict[str, Any] = {
"role": "user",
"content": multimodal_content,
}
result.append(cast(TResponseInputItem, user_message))

return result
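
# Illustrative sketch (not part of this diff): for a hypothetical transcript with one
# text-plus-image user turn, the mapper above would return two items. The exact summary
# text comes from _build_summary_message, which is defined elsewhere in this module, so
# the wording below is an assumption based on _stringify_content_list further down.
#
#   transcript = [
#       {"role": "user", "content": [
#           {"type": "input_text", "text": "What's in this photo?"},
#           {"type": "input_image", "image_url": "https://example.com/a.jpg"},
#       ]},
#   ]
#   default_handoff_history_mapper(transcript)
#   # -> [
#   #     {"role": "assistant", "content": "...What's in this photo? [1 image(s) attached]..."},
#   #     {"role": "user", "content": [{"type": "input_image", "image_url": "https://example.com/a.jpg"}]},
#   # ]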


def _normalize_input_history(
@@ -157,12 +179,61 @@ def _stringify_content(content: Any) -> str:
return ""
if isinstance(content, str):
return content
# Handle multimodal content (list of content parts).
if isinstance(content, list):
return _stringify_content_list(content)
try:
return json.dumps(content, ensure_ascii=False, default=str)
except TypeError:
return str(content)


def _stringify_content_list(content_list: list[Any]) -> str:
"""Convert a list of content parts to a human-readable string.

For multimodal content, this provides a summary that indicates the presence
of images, files, and audio without including their binary data.
"""
parts: list[str] = []
image_count = 0
file_count = 0
audio_count = 0

for part in content_list:
if isinstance(part, dict):
part_type = part.get("type")
if part_type == "input_text":
text = part.get("text", "")
if text:
parts.append(text)
elif part_type == "input_image":
image_count += 1
elif part_type == "input_file":
file_count += 1
elif part_type == "input_audio":
audio_count += 1
else:
# Unknown part type; fall back to JSON (or str) serialization.
try:
parts.append(json.dumps(part, ensure_ascii=False, default=str))
except TypeError:
parts.append(str(part))
elif isinstance(part, str):
parts.append(part)
else:
parts.append(str(part))

# Add indicators for multimodal content.
if image_count > 0:
parts.append(f"[{image_count} image(s) attached]")
if file_count > 0:
parts.append(f"[{file_count} file(s) attached]")
if audio_count > 0:
parts.append(f"[{audio_count} audio file(s) attached]")

return " ".join(parts)


def _flatten_nested_history_messages(
items: list[TResponseInputItem],
) -> list[TResponseInputItem]:
@@ -234,3 +305,37 @@ def _split_role_and_name(role_text: str) -> tuple[str, str | None]:
def _get_run_item_role(run_item: RunItem) -> str | None:
role_candidate = run_item.to_input_item().get("role")
return role_candidate if isinstance(role_candidate, str) else None


def _extract_multimodal_content(
transcript: list[TResponseInputItem],
) -> list[dict[str, Any]]:
"""Extract multimodal content (images, files, audio) from user messages in the transcript.

This function scans through all user messages and extracts any multimodal content parts
(input_image, input_file, input_audio) so they can be preserved during handoffs.

Returns:
A list of multimodal content items, or an empty list if none found.
"""
multimodal_parts: list[dict[str, Any]] = []

for item in transcript:
# Only extract multimodal content from user messages.
role = item.get("role")
if role != "user":
continue

content = item.get("content")
if content is None:
continue

# If content is a list, check each part for multimodal types.
if isinstance(content, list):
for part in content:
if isinstance(part, dict):
part_type = part.get("type")
if part_type in _MULTIMODAL_CONTENT_TYPES:
multimodal_parts.append(deepcopy(part))

return multimodal_parts
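
# Illustrative sketch (not part of this diff): only multimodal parts from user-role
# messages are extracted; assistant content and plain text parts are skipped. The
# image URL below is a placeholder.
#
#   _extract_multimodal_content([
#       {"role": "user", "content": [
#           {"type": "input_text", "text": "hi"},
#           {"type": "input_image", "image_url": "https://example.com/a.jpg"},
#       ]},
#       {"role": "assistant", "content": "hello"},
#   ])
#   # -> [{"type": "input_image", "image_url": "https://example.com/a.jpg"}]
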
213 changes: 213 additions & 0 deletions tests/test_extension_filters.py
@@ -398,3 +398,216 @@ def map_history(items: list[TResponseInputItem]) -> list[TResponseInputItem]:
)
assert second["role"] == "user"
assert second["content"] == "Hello"


def _get_user_input_item_with_image(text: str, image_url: str) -> TResponseInputItem:
"""Create a user input item with both text and an image."""
return {
"role": "user",
"content": [
{"type": "input_text", "text": text},
{"type": "input_image", "image_url": image_url, "detail": "auto"},
],
}


def _get_user_input_item_with_file(text: str, file_data: str) -> TResponseInputItem:
"""Create a user input item with both text and a file."""
return {
"role": "user",
"content": [
{"type": "input_text", "text": text},
{"type": "input_file", "file_data": file_data, "filename": "test.txt"},
],
}


def _get_user_input_item_image_only(image_url: str) -> TResponseInputItem:
"""Create a user input item with only an image (no text)."""
return {
"role": "user",
"content": [
{"type": "input_image", "image_url": image_url, "detail": "high"},
],
}


def test_nest_handoff_history_preserves_image_content() -> None:
"""Test that image content from user messages is preserved during handoff."""
image_url = "https://example.com/test-image.jpg"
data = HandoffInputData(
input_history=(_get_user_input_item_with_image("What's in this image?", image_url),),
pre_handoff_items=(_get_message_output_run_item("I see an image"),),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
# Should have 2 items: summary message + user message with image.
assert len(nested.input_history) == 2

# First item should be the summary.
summary = _as_message(nested.input_history[0])
assert summary["role"] == "assistant"
summary_content = summary["content"]
assert isinstance(summary_content, str)
assert "What's in this image?" in summary_content
assert "[1 image(s) attached]" in summary_content

# Second item should be the preserved image content.
image_msg = _as_message(nested.input_history[1])
assert image_msg["role"] == "user"
image_content = image_msg["content"]
assert isinstance(image_content, list)
assert len(image_content) == 1
assert image_content[0]["type"] == "input_image"
assert image_content[0]["image_url"] == image_url
assert image_content[0]["detail"] == "auto"


def test_nest_handoff_history_preserves_file_content() -> None:
"""Test that file content from user messages is preserved during handoff."""
file_data = "base64encodeddata"
data = HandoffInputData(
input_history=(_get_user_input_item_with_file("Analyze this file", file_data),),
pre_handoff_items=(_get_message_output_run_item("Analyzing file"),),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
assert len(nested.input_history) == 2

# First item should be the summary.
summary = _as_message(nested.input_history[0])
summary_content = summary["content"]
assert isinstance(summary_content, str)
assert "[1 file(s) attached]" in summary_content

# Second item should be the preserved file content.
file_msg = _as_message(nested.input_history[1])
assert file_msg["role"] == "user"
file_content = file_msg["content"]
assert isinstance(file_content, list)
assert len(file_content) == 1
assert file_content[0]["type"] == "input_file"
assert file_content[0]["file_data"] == file_data


def test_nest_handoff_history_preserves_multiple_images() -> None:
"""Test that multiple images from different user messages are preserved."""
image_url1 = "https://example.com/image1.jpg"
image_url2 = "https://example.com/image2.jpg"
data = HandoffInputData(
input_history=(
_get_user_input_item_image_only(image_url1),
_get_user_input_item_image_only(image_url2),
),
pre_handoff_items=(_get_message_output_run_item("Two images received"),),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
assert len(nested.input_history) == 2

# Second item should contain both images.
image_msg = _as_message(nested.input_history[1])
assert image_msg["role"] == "user"
image_content = image_msg["content"]
assert isinstance(image_content, list)
assert len(image_content) == 2
assert image_content[0]["type"] == "input_image"
assert image_content[0]["image_url"] == image_url1
assert image_content[1]["type"] == "input_image"
assert image_content[1]["image_url"] == image_url2


def test_nest_handoff_history_no_multimodal_single_message() -> None:
"""Test that text-only messages result in a single summary message."""
data = HandoffInputData(
input_history=(_get_user_input_item("Hello, how are you?"),),
pre_handoff_items=(_get_message_output_run_item("I am fine"),),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
# Should only have 1 item (no multimodal content).
assert len(nested.input_history) == 1
summary = _as_message(nested.input_history[0])
assert summary["role"] == "assistant"


def test_nest_handoff_history_ignores_multimodal_in_assistant_messages() -> None:
"""Test that multimodal content in non-user messages is not extracted.

Only user-uploaded content should be preserved, not content from assistant responses.
"""
# Construct an assistant message that (atypically) contains image content.
assistant_with_image: TResponseInputItem = { # type: ignore[misc,assignment]
"role": "assistant",
"content": [
{"type": "output_text", "text": "Here is the image"},
{"type": "input_image", "image_url": "https://example.com/generated.jpg"},
],
}
data = HandoffInputData(
input_history=(assistant_with_image,),
pre_handoff_items=(),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
# Should only have 1 item - no additional user message with multimodal content.
assert len(nested.input_history) == 1
summary = _as_message(nested.input_history[0])
assert summary["role"] == "assistant"


def test_nest_handoff_history_preserves_audio_content() -> None:
"""Test that audio content from user messages is preserved during handoff."""
audio_data = "base64audiocontent"
user_with_audio: TResponseInputItem = { # type: ignore[misc,assignment]
"role": "user",
"content": [
{"type": "input_text", "text": "Listen to this"},
{"type": "input_audio", "input_audio": {"data": audio_data, "format": "mp3"}},
],
}
data = HandoffInputData(
input_history=(user_with_audio,),
pre_handoff_items=(_get_message_output_run_item("Audio received"),),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
assert len(nested.input_history) == 2

# Check summary mentions audio.
summary = _as_message(nested.input_history[0])
summary_content = summary["content"]
assert isinstance(summary_content, str)
assert "[1 audio file(s) attached]" in summary_content

# Check audio is preserved.
audio_msg = _as_message(nested.input_history[1])
assert audio_msg["role"] == "user"
audio_content = audio_msg["content"]
assert isinstance(audio_content, list)
assert len(audio_content) == 1
assert audio_content[0]["type"] == "input_audio"