From 33b694afc373e7bbda1c8e8ea34d3111f74cfbe2 Mon Sep 17 00:00:00 2001
From: Saksham Singh Rathore
Date: Thu, 25 Dec 2025 19:02:28 +0530
Subject: [PATCH 1/3] fix: preserve multimodal content during handoffs with
 nest_handoff_history

When nest_handoff_history=True (the default), multimodal content (images,
files, audio) from user messages was being lost during handoffs because the
content was converted to a plain text summary.

This fix:
- Extracts multimodal content from user messages before summarization
- Adds the multimodal content as a separate user message after the summary
- Improves the text summary to show '[N image(s) attached]' instead of raw JSON

Fixes issue where target agents lose access to uploaded images during
same-turn handoffs.
---
 src/agents/handoffs/history.py  | 109 +++++++++++++++-
 tests/test_extension_filters.py | 213 ++++++++++++++++++++++++++++++++
 2 files changed, 320 insertions(+), 2 deletions(-)

diff --git a/src/agents/handoffs/history.py b/src/agents/handoffs/history.py
index dc59547fb..14f35dfa0 100644
--- a/src/agents/handoffs/history.py
+++ b/src/agents/handoffs/history.py
@@ -21,6 +21,10 @@
     "set_conversation_history_wrappers",
 ]
 
+# Content types that represent multimodal data (images, files, audio) which should be
+# preserved during handoffs rather than being converted to text summaries.
+_MULTIMODAL_CONTENT_TYPES = frozenset({"input_image", "input_file", "input_audio"})
+
 _DEFAULT_CONVERSATION_HISTORY_START = ""
 _DEFAULT_CONVERSATION_HISTORY_END = ""
 _conversation_history_start = _DEFAULT_CONVERSATION_HISTORY_START
@@ -90,10 +94,28 @@ def nest_handoff_history(
 def default_handoff_history_mapper(
     transcript: list[TResponseInputItem],
 ) -> list[TResponseInputItem]:
-    """Return a single assistant message summarizing the transcript."""
+    """Return a summary of the transcript, preserving multimodal content.
+
+    The returned list contains:
+    1. An assistant message summarizing the text conversation
+    2. A user message with any multimodal content (images, files, audio) if present
+
+    This ensures that multimodal content uploaded by users is preserved during handoffs.
+    """
+    multimodal_content = _extract_multimodal_content(transcript)
     summary_message = _build_summary_message(transcript)
-    return [summary_message]
+
+    result: list[TResponseInputItem] = [summary_message]
+
+    # If there's multimodal content, add it as a user message so the next agent can see it.
+    if multimodal_content:
+        user_message: dict[str, Any] = {
+            "role": "user",
+            "content": multimodal_content,
+        }
+        result.append(cast(TResponseInputItem, user_message))
+
+    return result
 
 
 def _normalize_input_history(
@@ -157,12 +179,61 @@ def _stringify_content(content: Any) -> str:
         return ""
     if isinstance(content, str):
         return content
+    # Handle multimodal content (list of content parts).
+    if isinstance(content, list):
+        return _stringify_content_list(content)
     try:
         return json.dumps(content, ensure_ascii=False, default=str)
     except TypeError:
         return str(content)
 
 
+def _stringify_content_list(content_list: list[Any]) -> str:
+    """Convert a list of content parts to a human-readable string.
+
+    For multimodal content, this provides a summary that indicates the presence
+    of images, files, and audio without including their binary data.
+    """
+    parts: list[str] = []
+    image_count = 0
+    file_count = 0
+    audio_count = 0
+
+    for part in content_list:
+        if isinstance(part, dict):
+            part_type = part.get("type")
+            if part_type == "input_text":
+                text = part.get("text", "")
+                if text:
+                    parts.append(text)
+            elif part_type == "input_image":
+                image_count += 1
+            elif part_type == "input_file":
+                file_count += 1
+            elif part_type == "input_audio":
+                audio_count += 1
+            else:
+                # Unknown type, try to stringify it.
+                try:
+                    parts.append(json.dumps(part, ensure_ascii=False, default=str))
+                except TypeError:
+                    parts.append(str(part))
+        elif isinstance(part, str):
+            parts.append(part)
+        else:
+            parts.append(str(part))
+
+    # Add indicators for multimodal content.
+    if image_count > 0:
+        parts.append(f"[{image_count} image(s) attached]")
+    if file_count > 0:
+        parts.append(f"[{file_count} file(s) attached]")
+    if audio_count > 0:
+        parts.append(f"[{audio_count} audio file(s) attached]")
+
+    return " ".join(parts)
+
+
 def _flatten_nested_history_messages(
     items: list[TResponseInputItem],
 ) -> list[TResponseInputItem]:
@@ -234,3 +305,37 @@ def _split_role_and_name(role_text: str) -> tuple[str, str | None]:
 def _get_run_item_role(run_item: RunItem) -> str | None:
     role_candidate = run_item.to_input_item().get("role")
     return role_candidate if isinstance(role_candidate, str) else None
+
+
+def _extract_multimodal_content(
+    transcript: list[TResponseInputItem],
+) -> list[dict[str, Any]]:
+    """Extract multimodal content (images, files, audio) from user messages in the transcript.
+
+    This function scans through all user messages and extracts any multimodal content parts
+    (input_image, input_file, input_audio) so they can be preserved during handoffs.
+
+    Returns:
+        A list of multimodal content items, or an empty list if none found.
+    """
+    multimodal_parts: list[dict[str, Any]] = []
+
+    for item in transcript:
+        # Only extract multimodal content from user messages.
+        role = item.get("role")
+        if role != "user":
+            continue
+
+        content = item.get("content")
+        if content is None:
+            continue
+
+        # If content is a list, check each part for multimodal types.
+        if isinstance(content, list):
+            for part in content:
+                if isinstance(part, dict):
+                    part_type = part.get("type")
+                    if part_type in _MULTIMODAL_CONTENT_TYPES:
+                        multimodal_parts.append(deepcopy(part))
+
+    return multimodal_parts
diff --git a/tests/test_extension_filters.py b/tests/test_extension_filters.py
index 86161bbb7..0eea2340b 100644
--- a/tests/test_extension_filters.py
+++ b/tests/test_extension_filters.py
@@ -398,3 +398,216 @@ def map_history(items: list[TResponseInputItem]) -> list[TResponseInputItem]:
     )
     assert second["role"] == "user"
     assert second["content"] == "Hello"
+
+
+def _get_user_input_item_with_image(text: str, image_url: str) -> TResponseInputItem:
+    """Create a user input item with both text and an image."""
+    return {
+        "role": "user",
+        "content": [
+            {"type": "input_text", "text": text},
+            {"type": "input_image", "image_url": image_url, "detail": "auto"},
+        ],
+    }
+
+
+def _get_user_input_item_with_file(text: str, file_data: str) -> TResponseInputItem:
+    """Create a user input item with both text and a file."""
+    return {
+        "role": "user",
+        "content": [
+            {"type": "input_text", "text": text},
+            {"type": "input_file", "file_data": file_data, "filename": "test.txt"},
+        ],
+    }
+
+
+def _get_user_input_item_image_only(image_url: str) -> TResponseInputItem:
+    """Create a user input item with only an image (no text)."""
+    return {
+        "role": "user",
+        "content": [
+            {"type": "input_image", "image_url": image_url, "detail": "high"},
+        ],
+    }
+
+
+def test_nest_handoff_history_preserves_image_content() -> None:
+    """Test that image content from user messages is preserved during handoff."""
+    image_url = "https://example.com/test-image.jpg"
+    data = HandoffInputData(
+        input_history=(_get_user_input_item_with_image("What's in this image?", image_url),),
+        pre_handoff_items=(_get_message_output_run_item("I see an image"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    # Should have 2 items: summary message + user message with image.
+    assert len(nested.input_history) == 2
+
+    # First item should be the summary.
+    summary = _as_message(nested.input_history[0])
+    assert summary["role"] == "assistant"
+    summary_content = summary["content"]
+    assert isinstance(summary_content, str)
+    assert "What's in this image?" in summary_content
+    assert "[1 image(s) attached]" in summary_content
+
+    # Second item should be the preserved image content.
+    image_msg = _as_message(nested.input_history[1])
+    assert image_msg["role"] == "user"
+    image_content = image_msg["content"]
+    assert isinstance(image_content, list)
+    assert len(image_content) == 1
+    assert image_content[0]["type"] == "input_image"
+    assert image_content[0]["image_url"] == image_url
+    assert image_content[0]["detail"] == "auto"
+
+
+def test_nest_handoff_history_preserves_file_content() -> None:
+    """Test that file content from user messages is preserved during handoff."""
+    file_data = "base64encodeddata"
+    data = HandoffInputData(
+        input_history=(_get_user_input_item_with_file("Analyze this file", file_data),),
+        pre_handoff_items=(_get_message_output_run_item("Analyzing file"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    assert len(nested.input_history) == 2
+
+    # First item should be the summary.
+    summary = _as_message(nested.input_history[0])
+    summary_content = summary["content"]
+    assert isinstance(summary_content, str)
+    assert "[1 file(s) attached]" in summary_content
+
+    # Second item should be the preserved file content.
+    file_msg = _as_message(nested.input_history[1])
+    assert file_msg["role"] == "user"
+    file_content = file_msg["content"]
+    assert isinstance(file_content, list)
+    assert len(file_content) == 1
+    assert file_content[0]["type"] == "input_file"
+    assert file_content[0]["file_data"] == file_data
+
+
+def test_nest_handoff_history_preserves_multiple_images() -> None:
+    """Test that multiple images from different user messages are preserved."""
+    image_url1 = "https://example.com/image1.jpg"
+    image_url2 = "https://example.com/image2.jpg"
+    data = HandoffInputData(
+        input_history=(
+            _get_user_input_item_image_only(image_url1),
+            _get_user_input_item_image_only(image_url2),
+        ),
+        pre_handoff_items=(_get_message_output_run_item("Two images received"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    assert len(nested.input_history) == 2
+
+    # Second item should contain both images.
+    image_msg = _as_message(nested.input_history[1])
+    assert image_msg["role"] == "user"
+    image_content = image_msg["content"]
+    assert isinstance(image_content, list)
+    assert len(image_content) == 2
+    assert image_content[0]["type"] == "input_image"
+    assert image_content[0]["image_url"] == image_url1
+    assert image_content[1]["type"] == "input_image"
+    assert image_content[1]["image_url"] == image_url2
+
+
+def test_nest_handoff_history_no_multimodal_single_message() -> None:
+    """Test that text-only messages result in a single summary message."""
+    data = HandoffInputData(
+        input_history=(_get_user_input_item("Hello, how are you?"),),
+        pre_handoff_items=(_get_message_output_run_item("I am fine"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    # Should only have 1 item (no multimodal content).
+    assert len(nested.input_history) == 1
+    summary = _as_message(nested.input_history[0])
+    assert summary["role"] == "assistant"
+
+
+def test_nest_handoff_history_ignores_multimodal_in_assistant_messages() -> None:
+    """Test that multimodal content in non-user messages is not extracted.
+
+    Only user-uploaded content should be preserved, not content from assistant responses.
+    """
+    # Create an assistant message that somehow has image content.
+    assistant_with_image: TResponseInputItem = {  # type: ignore[misc,assignment]
+        "role": "assistant",
+        "content": [
+            {"type": "output_text", "text": "Here is the image"},
+            {"type": "input_image", "image_url": "https://example.com/generated.jpg"},
+        ],
+    }
+    data = HandoffInputData(
+        input_history=(assistant_with_image,),
+        pre_handoff_items=(),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    # Should only have 1 item - no additional user message with multimodal content.
+    assert len(nested.input_history) == 1
+    summary = _as_message(nested.input_history[0])
+    assert summary["role"] == "assistant"
+
+
+def test_nest_handoff_history_preserves_audio_content() -> None:
+    """Test that audio content from user messages is preserved during handoff."""
+    audio_data = "base64audiocontent"
+    user_with_audio: TResponseInputItem = {  # type: ignore[misc,assignment]
+        "role": "user",
+        "content": [
+            {"type": "input_text", "text": "Listen to this"},
+            {"type": "input_audio", "input_audio": {"data": audio_data, "format": "mp3"}},
+        ],
+    }
+    data = HandoffInputData(
+        input_history=(user_with_audio,),
+        pre_handoff_items=(_get_message_output_run_item("Audio received"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    assert len(nested.input_history) == 2
+
+    # Check summary mentions audio.
+    summary = _as_message(nested.input_history[0])
+    summary_content = summary["content"]
+    assert isinstance(summary_content, str)
+    assert "[1 audio file(s) attached]" in summary_content
+
+    # Check audio is preserved.
+    audio_msg = _as_message(nested.input_history[1])
+    assert audio_msg["role"] == "user"
+    audio_content = audio_msg["content"]
+    assert isinstance(audio_content, list)
+    assert len(audio_content) == 1
+    assert audio_content[0]["type"] == "input_audio"

From 0de4358a696f270d0e9bd88feeeaa3891e79672f Mon Sep 17 00:00:00 2001
From: Saksham Singh Rathore
Date: Thu, 25 Dec 2025 19:17:33 +0530
Subject: [PATCH 2/3] fix: prevent duplicate multimodal content in chained
 handoffs

Add marker system to track preserved multimodal content:
- Add _PRESERVED_MULTIMODAL_MARKER constant to mark preserved messages
- Skip already-preserved messages during extraction
- Add _collect_preserved_multimodal_content() to carry forward existing
  preserved content across chained handoffs
- Add test for chained handoffs scenario

This addresses the P1 code review feedback about duplicate conversation
turns across chained handoffs.
---
 src/agents/handoffs/history.py  | 62 +++++++++++++++++++++++++++++++--
 tests/test_extension_filters.py | 62 +++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 3 deletions(-)

diff --git a/src/agents/handoffs/history.py b/src/agents/handoffs/history.py
index 14f35dfa0..0ed93495c 100644
--- a/src/agents/handoffs/history.py
+++ b/src/agents/handoffs/history.py
@@ -25,6 +25,10 @@
 # preserved during handoffs rather than being converted to text summaries.
 _MULTIMODAL_CONTENT_TYPES = frozenset({"input_image", "input_file", "input_audio"})
 
+# Marker name used to identify user messages that contain preserved multimodal content
+# from a previous handoff. This prevents re-extraction and duplication across chained handoffs.
+_PRESERVED_MULTIMODAL_MARKER = "__multimodal_preserved__"
+
 _DEFAULT_CONVERSATION_HISTORY_START = ""
 _DEFAULT_CONVERSATION_HISTORY_END = ""
 _conversation_history_start = _DEFAULT_CONVERSATION_HISTORY_START
@@ -101,17 +105,28 @@ def default_handoff_history_mapper(
     2. A user message with any multimodal content (images, files, audio) if present
 
     This ensures that multimodal content uploaded by users is preserved during handoffs.
+    Multimodal content is only extracted once and carried forward across chained handoffs.
     """
-    multimodal_content = _extract_multimodal_content(transcript)
+    # Extract NEW multimodal content from user messages (excludes already-preserved content).
+    new_multimodal_content = _extract_multimodal_content(transcript)
+
+    # Also collect any already-preserved multimodal content from previous handoffs.
+    existing_multimodal_content = _collect_preserved_multimodal_content(transcript)
+
+    # Combine new and existing multimodal content.
+    all_multimodal_content = existing_multimodal_content + new_multimodal_content
+
     summary_message = _build_summary_message(transcript)
 
     result: list[TResponseInputItem] = [summary_message]
 
     # If there's multimodal content, add it as a user message so the next agent can see it.
-    if multimodal_content:
+    # Mark it with a special name to prevent re-extraction in subsequent handoffs.
+    if all_multimodal_content:
         user_message: dict[str, Any] = {
             "role": "user",
+            "name": _PRESERVED_MULTIMODAL_MARKER,
-            "content": multimodal_content,
+            "content": all_multimodal_content,
         }
         result.append(cast(TResponseInputItem, user_message))
 
@@ -326,6 +341,12 @@ def _extract_multimodal_content(
         if role != "user":
             continue
 
+        # Skip messages that are already preserved multimodal content from a previous handoff.
+        # This prevents duplication across chained handoffs.
+        name = item.get("name")
+        if name == _PRESERVED_MULTIMODAL_MARKER:
+            continue
+
         content = item.get("content")
         if content is None:
             continue
@@ -339,3 +360,38 @@ def _extract_multimodal_content(
                         multimodal_parts.append(deepcopy(part))
 
     return multimodal_parts
+
+
+def _collect_preserved_multimodal_content(
+    transcript: list[TResponseInputItem],
+) -> list[dict[str, Any]]:
+    """Collect multimodal content from messages already marked as preserved.
+
+    This function finds user messages marked with the preservation marker from previous
+    handoffs and collects their content to carry forward.
+
+    Returns:
+        A list of multimodal content items from preserved messages, or an empty list if none.
+    """
+    preserved_parts: list[dict[str, Any]] = []
+
+    for item in transcript:
+        role = item.get("role")
+        if role != "user":
+            continue
+
+        name = item.get("name")
+        if name != _PRESERVED_MULTIMODAL_MARKER:
+            continue
+
+        content = item.get("content")
+        if content is None:
+            continue
+
+        # The preserved message content is a list of multimodal items.
+        if isinstance(content, list):
+            for part in content:
+                if isinstance(part, dict):
+                    preserved_parts.append(deepcopy(part))
+
+    return preserved_parts
diff --git a/tests/test_extension_filters.py b/tests/test_extension_filters.py
index 0eea2340b..4b16c9ed9 100644
--- a/tests/test_extension_filters.py
+++ b/tests/test_extension_filters.py
@@ -611,3 +611,65 @@ def test_nest_handoff_history_preserves_audio_content() -> None:
     assert isinstance(audio_content, list)
     assert len(audio_content) == 1
     assert audio_content[0]["type"] == "input_audio"
+
+
+def test_nest_handoff_history_no_duplicate_on_chained_handoffs() -> None:
+    """Test that multimodal content is not duplicated across chained handoffs.
+
+    When an agent hands off to another agent, and that agent hands off again,
+    the multimodal content should only appear once, not be re-extracted and duplicated.
+    """
+    image_url = "https://example.com/test-image.jpg"
+
+    # First handoff: user sends image, agent responds and hands off.
+    first_data = HandoffInputData(
+        input_history=(_get_user_input_item_with_image("What's in this image?", image_url),),
+        pre_handoff_items=(_get_message_output_run_item("Let me hand this off"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+    first_nested = nest_handoff_history(first_data)
+
+    # Verify first handoff has 2 items: summary + preserved image.
+    assert len(first_nested.input_history) == 2
+    first_preserved = _as_message(first_nested.input_history[1])
+    assert first_preserved["role"] == "user"
+    first_content = first_preserved["content"]
+    assert isinstance(first_content, list)
+    assert len(first_content) == 1
+    assert first_content[0]["type"] == "input_image"
+
+    # Second handoff: the new agent responds and hands off again.
+    # The input_history now contains the result from the first handoff.
+    second_data = HandoffInputData(
+        input_history=first_nested.input_history,
+        pre_handoff_items=(_get_message_output_run_item("Handing off again"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+    second_nested = nest_handoff_history(second_data)
+
+    # The second handoff should still only have 2 items, not 3.
+    # The preserved image from the first handoff should not be re-extracted.
+    assert len(second_nested.input_history) == 2
+
+    # Verify the image is still preserved (only once).
+    second_preserved = _as_message(second_nested.input_history[1])
+    assert second_preserved["role"] == "user"
+    second_content = second_preserved["content"]
+    assert isinstance(second_content, list)
+    assert len(second_content) == 1
+    assert second_content[0]["type"] == "input_image"
+    assert second_content[0]["image_url"] == image_url
+
+    # Third handoff: verify it still doesn't duplicate.
+    third_data = HandoffInputData(
+        input_history=second_nested.input_history,
+        pre_handoff_items=(_get_message_output_run_item("One more handoff"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+    third_nested = nest_handoff_history(third_data)
+
+    # Still only 2 items after three handoffs.
+    assert len(third_nested.input_history) == 2

From 89e8acb746821f23434f188ad59c61cf59a7e8f1 Mon Sep 17 00:00:00 2001
From: Saksham Singh Rathore
Date: Sat, 27 Dec 2025 11:27:18 +0530
Subject: [PATCH 3/3] fix tests

---
 tests/test_extension_filters.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_extension_filters.py b/tests/test_extension_filters.py
index 4b16c9ed9..2c6104a5a 100644
--- a/tests/test_extension_filters.py
+++ b/tests/test_extension_filters.py
@@ -632,6 +632,7 @@ def test_nest_handoff_history_no_duplicate_on_chained_handoffs() -> None:
 
     # Verify first handoff has 2 items: summary + preserved image.
     assert len(first_nested.input_history) == 2
+    assert not isinstance(first_nested.input_history, str)
     first_preserved = _as_message(first_nested.input_history[1])
     assert first_preserved["role"] == "user"
     first_content = first_preserved["content"]
@@ -652,6 +653,7 @@ def test_nest_handoff_history_no_duplicate_on_chained_handoffs() -> None:
     # The second handoff should still only have 2 items, not 3.
     # The preserved image from the first handoff should not be re-extracted.
     assert len(second_nested.input_history) == 2
+    assert not isinstance(second_nested.input_history, str)
 
     # Verify the image is still preserved (only once).
     second_preserved = _as_message(second_nested.input_history[1])
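
For reference, a minimal usage sketch of the behavior this series implements, condensed from the tests above. It is illustrative only: the import paths are assumptions inferred from the diff's file layout and have not been confirmed against the SDK's public API.

# Sketch of what nest_handoff_history produces after these patches.
# Assumed imports -- adjust to the SDK's actual exports.
from agents import HandoffInputData, RunContextWrapper  # assumed re-exports
from agents.handoffs.history import nest_handoff_history  # path taken from the diff

user_turn = {
    "role": "user",
    "content": [
        {"type": "input_text", "text": "What's in this image?"},
        {"type": "input_image", "image_url": "https://example.com/cat.jpg", "detail": "auto"},
    ],
}

data = HandoffInputData(
    input_history=(user_turn,),
    pre_handoff_items=(),
    new_items=(),
    run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)
# nested.input_history[0]: the assistant summary, whose text contains both
#   "What's in this image?" and "[1 image(s) attached]".
# nested.input_history[1]: a user message with name "__multimodal_preserved__"
#   carrying the original input_image part, so the target agent still sees it.

# Chaining (PATCH 2/3): feeding the result through nest_handoff_history again
# keeps the history at two items; the marker prevents re-extracting the image.
chained = nest_handoff_history(
    HandoffInputData(
        input_history=nested.input_history,
        pre_handoff_items=(),
        new_items=(),
        run_context=RunContextWrapper(context=()),
    )
)
assert len(chained.input_history) == 2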