From 33b694afc373e7bbda1c8e8ea34d3111f74cfbe2 Mon Sep 17 00:00:00 2001
From: Saksham Singh Rathore
Date: Thu, 25 Dec 2025 19:02:28 +0530
Subject: [PATCH 1/3] fix: preserve multimodal content during handoffs with
 nest_handoff_history

When nest_handoff_history=True (the default), multimodal content (images,
files, audio) from user messages was being lost during handoffs because the
content was converted to a plain text summary.

This fix:
- Extracts multimodal content from user messages before summarization
- Adds the multimodal content as a separate user message after the summary
- Improves the text summary to show '[N image(s) attached]' instead of raw JSON

Fixes issue where target agents lose access to uploaded images during
same-turn handoffs.
---
 src/agents/handoffs/history.py  | 109 +++++++++++++++-
 tests/test_extension_filters.py | 213 ++++++++++++++++++++++++++++++++
 2 files changed, 320 insertions(+), 2 deletions(-)

diff --git a/src/agents/handoffs/history.py b/src/agents/handoffs/history.py
index dc59547fb..14f35dfa0 100644
--- a/src/agents/handoffs/history.py
+++ b/src/agents/handoffs/history.py
@@ -21,6 +21,10 @@
     "set_conversation_history_wrappers",
 ]
 
+# Content types that represent multimodal data (images, files, audio) which should be
+# preserved during handoffs rather than being converted to text summaries.
+_MULTIMODAL_CONTENT_TYPES = frozenset({"input_image", "input_file", "input_audio"})
+
 _DEFAULT_CONVERSATION_HISTORY_START = ""
 _DEFAULT_CONVERSATION_HISTORY_END = ""
 _conversation_history_start = _DEFAULT_CONVERSATION_HISTORY_START
@@ -90,10 +94,28 @@ def nest_handoff_history(
 def default_handoff_history_mapper(
     transcript: list[TResponseInputItem],
 ) -> list[TResponseInputItem]:
-    """Return a single assistant message summarizing the transcript."""
+    """Return a summary of the transcript, preserving multimodal content.
+
+    The returned list contains:
+    1. An assistant message summarizing the text conversation
+    2. A user message with any multimodal content (images, files, audio) if present
+
+    This ensures that multimodal content uploaded by users is preserved during handoffs.
+    """
+    multimodal_content = _extract_multimodal_content(transcript)
     summary_message = _build_summary_message(transcript)
-    return [summary_message]
+
+    result: list[TResponseInputItem] = [summary_message]
+
+    # If there's multimodal content, add it as a user message so the next agent can see it.
+    if multimodal_content:
+        user_message: dict[str, Any] = {
+            "role": "user",
+            "content": multimodal_content,
+        }
+        result.append(cast(TResponseInputItem, user_message))
+
+    return result
 
 
 def _normalize_input_history(
@@ -157,12 +179,61 @@ def _stringify_content(content: Any) -> str:
         return ""
     if isinstance(content, str):
         return content
+    # Handle multimodal content (list of content parts).
+    if isinstance(content, list):
+        return _stringify_content_list(content)
     try:
         return json.dumps(content, ensure_ascii=False, default=str)
     except TypeError:
         return str(content)
 
 
+def _stringify_content_list(content_list: list[Any]) -> str:
+    """Convert a list of content parts to a human-readable string.
+
+    For multimodal content, this provides a summary that indicates the presence
+    of images, files, and audio without including their binary data.
+    """
+    parts: list[str] = []
+    image_count = 0
+    file_count = 0
+    audio_count = 0
+
+    for part in content_list:
+        if isinstance(part, dict):
+            part_type = part.get("type")
+            if part_type == "input_text":
+                text = part.get("text", "")
+                if text:
+                    parts.append(text)
+            elif part_type == "input_image":
+                image_count += 1
+            elif part_type == "input_file":
+                file_count += 1
+            elif part_type == "input_audio":
+                audio_count += 1
+            else:
+                # Unknown type, try to stringify it.
+                try:
+                    parts.append(json.dumps(part, ensure_ascii=False, default=str))
+                except TypeError:
+                    parts.append(str(part))
+        elif isinstance(part, str):
+            parts.append(part)
+        else:
+            parts.append(str(part))
+
+    # Add indicators for multimodal content.
+    if image_count > 0:
+        parts.append(f"[{image_count} image(s) attached]")
+    if file_count > 0:
+        parts.append(f"[{file_count} file(s) attached]")
+    if audio_count > 0:
+        parts.append(f"[{audio_count} audio file(s) attached]")
+
+    return " ".join(parts)
+
+
 def _flatten_nested_history_messages(
     items: list[TResponseInputItem],
 ) -> list[TResponseInputItem]:
@@ -234,3 +305,37 @@ def _split_role_and_name(role_text: str) -> tuple[str, str | None]:
 def _get_run_item_role(run_item: RunItem) -> str | None:
     role_candidate = run_item.to_input_item().get("role")
     return role_candidate if isinstance(role_candidate, str) else None
+
+
+def _extract_multimodal_content(
+    transcript: list[TResponseInputItem],
+) -> list[dict[str, Any]]:
+    """Extract multimodal content (images, files, audio) from user messages in the transcript.
+
+    This function scans through all user messages and extracts any multimodal content parts
+    (input_image, input_file, input_audio) so they can be preserved during handoffs.
+
+    Returns:
+        A list of multimodal content items, or an empty list if none found.
+    """
+    multimodal_parts: list[dict[str, Any]] = []
+
+    for item in transcript:
+        # Only extract multimodal content from user messages.
+        role = item.get("role")
+        if role != "user":
+            continue
+
+        content = item.get("content")
+        if content is None:
+            continue
+
+        # If content is a list, check each part for multimodal types.
+        if isinstance(content, list):
+            for part in content:
+                if isinstance(part, dict):
+                    part_type = part.get("type")
+                    if part_type in _MULTIMODAL_CONTENT_TYPES:
+                        multimodal_parts.append(deepcopy(part))
+
+    return multimodal_parts
diff --git a/tests/test_extension_filters.py b/tests/test_extension_filters.py
index 86161bbb7..0eea2340b 100644
--- a/tests/test_extension_filters.py
+++ b/tests/test_extension_filters.py
@@ -398,3 +398,216 @@ def map_history(items: list[TResponseInputItem]) -> list[TResponseInputItem]:
     )
     assert second["role"] == "user"
     assert second["content"] == "Hello"
+
+
+def _get_user_input_item_with_image(text: str, image_url: str) -> TResponseInputItem:
+    """Create a user input item with both text and an image."""
+    return {
+        "role": "user",
+        "content": [
+            {"type": "input_text", "text": text},
+            {"type": "input_image", "image_url": image_url, "detail": "auto"},
+        ],
+    }
+
+
+def _get_user_input_item_with_file(text: str, file_data: str) -> TResponseInputItem:
+    """Create a user input item with both text and a file."""
+    return {
+        "role": "user",
+        "content": [
+            {"type": "input_text", "text": text},
+            {"type": "input_file", "file_data": file_data, "filename": "test.txt"},
+        ],
+    }
+
+
+def _get_user_input_item_image_only(image_url: str) -> TResponseInputItem:
+    """Create a user input item with only an image (no text)."""
+    return {
+        "role": "user",
+        "content": [
+            {"type": "input_image", "image_url": image_url, "detail": "high"},
+        ],
+    }
+
+
+def test_nest_handoff_history_preserves_image_content() -> None:
+    """Test that image content from user messages is preserved during handoff."""
+    image_url = "https://example.com/test-image.jpg"
+    data = HandoffInputData(
+        input_history=(_get_user_input_item_with_image("What's in this image?", image_url),),
+        pre_handoff_items=(_get_message_output_run_item("I see an image"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    # Should have 2 items: summary message + user message with image.
+    assert len(nested.input_history) == 2
+
+    # First item should be the summary.
+    summary = _as_message(nested.input_history[0])
+    assert summary["role"] == "assistant"
+    summary_content = summary["content"]
+    assert isinstance(summary_content, str)
+    assert "What's in this image?" in summary_content
+    assert "[1 image(s) attached]" in summary_content
+
+    # Second item should be the preserved image content.
+    image_msg = _as_message(nested.input_history[1])
+    assert image_msg["role"] == "user"
+    image_content = image_msg["content"]
+    assert isinstance(image_content, list)
+    assert len(image_content) == 1
+    assert image_content[0]["type"] == "input_image"
+    assert image_content[0]["image_url"] == image_url
+    assert image_content[0]["detail"] == "auto"
+
+
+def test_nest_handoff_history_preserves_file_content() -> None:
+    """Test that file content from user messages is preserved during handoff."""
+    file_data = "base64encodeddata"
+    data = HandoffInputData(
+        input_history=(_get_user_input_item_with_file("Analyze this file", file_data),),
+        pre_handoff_items=(_get_message_output_run_item("Analyzing file"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    assert len(nested.input_history) == 2
+
+    # First item should be the summary.
+    summary = _as_message(nested.input_history[0])
+    summary_content = summary["content"]
+    assert isinstance(summary_content, str)
+    assert "[1 file(s) attached]" in summary_content
+
+    # Second item should be the preserved file content.
+    file_msg = _as_message(nested.input_history[1])
+    assert file_msg["role"] == "user"
+    file_content = file_msg["content"]
+    assert isinstance(file_content, list)
+    assert len(file_content) == 1
+    assert file_content[0]["type"] == "input_file"
+    assert file_content[0]["file_data"] == file_data
+
+
+def test_nest_handoff_history_preserves_multiple_images() -> None:
+    """Test that multiple images from different user messages are preserved."""
+    image_url1 = "https://example.com/image1.jpg"
+    image_url2 = "https://example.com/image2.jpg"
+    data = HandoffInputData(
+        input_history=(
+            _get_user_input_item_image_only(image_url1),
+            _get_user_input_item_image_only(image_url2),
+        ),
+        pre_handoff_items=(_get_message_output_run_item("Two images received"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    assert len(nested.input_history) == 2
+
+    # Second item should contain both images.
+    image_msg = _as_message(nested.input_history[1])
+    assert image_msg["role"] == "user"
+    image_content = image_msg["content"]
+    assert isinstance(image_content, list)
+    assert len(image_content) == 2
+    assert image_content[0]["type"] == "input_image"
+    assert image_content[0]["image_url"] == image_url1
+    assert image_content[1]["type"] == "input_image"
+    assert image_content[1]["image_url"] == image_url2
+
+
+def test_nest_handoff_history_no_multimodal_single_message() -> None:
+    """Test that text-only messages result in a single summary message."""
+    data = HandoffInputData(
+        input_history=(_get_user_input_item("Hello, how are you?"),),
+        pre_handoff_items=(_get_message_output_run_item("I am fine"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    # Should only have 1 item (no multimodal content).
+    assert len(nested.input_history) == 1
+    summary = _as_message(nested.input_history[0])
+    assert summary["role"] == "assistant"
+
+
+def test_nest_handoff_history_ignores_multimodal_in_assistant_messages() -> None:
+    """Test that multimodal content in non-user messages is not extracted.
+
+    Only user-uploaded content should be preserved, not content from assistant responses.
+    """
+    # Create an assistant message that somehow has image content.
+    assistant_with_image: TResponseInputItem = {  # type: ignore[misc,assignment]
+        "role": "assistant",
+        "content": [
+            {"type": "output_text", "text": "Here is the image"},
+            {"type": "input_image", "image_url": "https://example.com/generated.jpg"},
+        ],
+    }
+    data = HandoffInputData(
+        input_history=(assistant_with_image,),
+        pre_handoff_items=(),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    # Should only have 1 item - no additional user message with multimodal content.
+    assert len(nested.input_history) == 1
+    summary = _as_message(nested.input_history[0])
+    assert summary["role"] == "assistant"
+
+
+def test_nest_handoff_history_preserves_audio_content() -> None:
+    """Test that audio content from user messages is preserved during handoff."""
+    audio_data = "base64audiocontent"
+    user_with_audio: TResponseInputItem = {  # type: ignore[misc,assignment]
+        "role": "user",
+        "content": [
+            {"type": "input_text", "text": "Listen to this"},
+            {"type": "input_audio", "input_audio": {"data": audio_data, "format": "mp3"}},
+        ],
+    }
+    data = HandoffInputData(
+        input_history=(user_with_audio,),
+        pre_handoff_items=(_get_message_output_run_item("Audio received"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    assert len(nested.input_history) == 2
+
+    # Check summary mentions audio.
+    summary = _as_message(nested.input_history[0])
+    summary_content = summary["content"]
+    assert isinstance(summary_content, str)
+    assert "[1 audio file(s) attached]" in summary_content
+
+    # Check audio is preserved.
+    audio_msg = _as_message(nested.input_history[1])
+    assert audio_msg["role"] == "user"
+    audio_content = audio_msg["content"]
+    assert isinstance(audio_content, list)
+    assert len(audio_content) == 1
+    assert audio_content[0]["type"] == "input_audio"

From 0de4358a696f270d0e9bd88feeeaa3891e79672f Mon Sep 17 00:00:00 2001
From: Saksham Singh Rathore
Date: Thu, 25 Dec 2025 19:17:33 +0530
Subject: [PATCH 2/3] fix: prevent duplicate multimodal content in chained
 handoffs

Add marker system to track preserved multimodal content:
- Add _PRESERVED_MULTIMODAL_MARKER constant to mark preserved messages
- Skip already-preserved messages during extraction
- Add _collect_preserved_multimodal_content() to carry forward existing
  preserved content across chained handoffs
- Add test for chained handoffs scenario

This addresses the P1 code review feedback about duplicate conversation
turns across chained handoffs.
---
 src/agents/handoffs/history.py  | 62 +++++++++++++++++++++++++++++++--
 tests/test_extension_filters.py | 62 +++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 3 deletions(-)

diff --git a/src/agents/handoffs/history.py b/src/agents/handoffs/history.py
index 14f35dfa0..0ed93495c 100644
--- a/src/agents/handoffs/history.py
+++ b/src/agents/handoffs/history.py
@@ -25,6 +25,10 @@
 # preserved during handoffs rather than being converted to text summaries.
 _MULTIMODAL_CONTENT_TYPES = frozenset({"input_image", "input_file", "input_audio"})
 
+# Marker name used to identify user messages that contain preserved multimodal content
+# from a previous handoff. This prevents re-extraction and duplication across chained handoffs.
+_PRESERVED_MULTIMODAL_MARKER = "__multimodal_preserved__"
+
 _DEFAULT_CONVERSATION_HISTORY_START = ""
 _DEFAULT_CONVERSATION_HISTORY_END = ""
 _conversation_history_start = _DEFAULT_CONVERSATION_HISTORY_START
@@ -101,17 +105,28 @@ def default_handoff_history_mapper(
     2. A user message with any multimodal content (images, files, audio) if present
 
     This ensures that multimodal content uploaded by users is preserved during handoffs.
+    Multimodal content is only extracted once and carried forward across chained handoffs.
     """
-    multimodal_content = _extract_multimodal_content(transcript)
+    # Extract NEW multimodal content from user messages (excludes already-preserved content).
+    new_multimodal_content = _extract_multimodal_content(transcript)
+
+    # Also collect any already-preserved multimodal content from previous handoffs.
+    existing_multimodal_content = _collect_preserved_multimodal_content(transcript)
+
+    # Combine new and existing multimodal content.
+    all_multimodal_content = existing_multimodal_content + new_multimodal_content
+
     summary_message = _build_summary_message(transcript)
 
     result: list[TResponseInputItem] = [summary_message]
 
     # If there's multimodal content, add it as a user message so the next agent can see it.
-    if multimodal_content:
+    # Mark it with a special name to prevent re-extraction in subsequent handoffs.
+    if all_multimodal_content:
         user_message: dict[str, Any] = {
             "role": "user",
+            "name": _PRESERVED_MULTIMODAL_MARKER,
-            "content": multimodal_content,
+            "content": all_multimodal_content,
         }
         result.append(cast(TResponseInputItem, user_message))
 
@@ -326,6 +341,12 @@ def _extract_multimodal_content(
         if role != "user":
             continue
 
+        # Skip messages that are already preserved multimodal content from a previous handoff.
+        # This prevents duplication across chained handoffs.
+        name = item.get("name")
+        if name == _PRESERVED_MULTIMODAL_MARKER:
+            continue
+
         content = item.get("content")
         if content is None:
             continue
@@ -339,3 +360,38 @@ def _extract_multimodal_content(
                         multimodal_parts.append(deepcopy(part))
 
     return multimodal_parts
+
+
+def _collect_preserved_multimodal_content(
+    transcript: list[TResponseInputItem],
+) -> list[dict[str, Any]]:
+    """Collect multimodal content from messages already marked as preserved.
+
+    This function finds user messages marked with the preservation marker from previous
+    handoffs and collects their content to carry forward.
+
+    Returns:
+        A list of multimodal content items from preserved messages, or an empty list if none.
+    """
+    preserved_parts: list[dict[str, Any]] = []
+
+    for item in transcript:
+        role = item.get("role")
+        if role != "user":
+            continue
+
+        name = item.get("name")
+        if name != _PRESERVED_MULTIMODAL_MARKER:
+            continue
+
+        content = item.get("content")
+        if content is None:
+            continue
+
+        # The preserved message content is a list of multimodal items.
+        if isinstance(content, list):
+            for part in content:
+                if isinstance(part, dict):
+                    preserved_parts.append(deepcopy(part))
+
+    return preserved_parts
diff --git a/tests/test_extension_filters.py b/tests/test_extension_filters.py
index 0eea2340b..4b16c9ed9 100644
--- a/tests/test_extension_filters.py
+++ b/tests/test_extension_filters.py
@@ -611,3 +611,65 @@ def test_nest_handoff_history_preserves_audio_content() -> None:
     assert isinstance(audio_content, list)
     assert len(audio_content) == 1
     assert audio_content[0]["type"] == "input_audio"
+
+
+def test_nest_handoff_history_no_duplicate_on_chained_handoffs() -> None:
+    """Test that multimodal content is not duplicated across chained handoffs.
+
+    When an agent hands off to another agent, and that agent hands off again,
+    the multimodal content should only appear once, not be re-extracted and duplicated.
+    """
+    image_url = "https://example.com/test-image.jpg"
+
+    # First handoff: user sends image, agent responds and hands off.
+    first_data = HandoffInputData(
+        input_history=(_get_user_input_item_with_image("What's in this image?", image_url),),
+        pre_handoff_items=(_get_message_output_run_item("Let me hand this off"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+    first_nested = nest_handoff_history(first_data)
+
+    # Verify first handoff has 2 items: summary + preserved image.
+    assert len(first_nested.input_history) == 2
+    first_preserved = _as_message(first_nested.input_history[1])
+    assert first_preserved["role"] == "user"
+    first_content = first_preserved["content"]
+    assert isinstance(first_content, list)
+    assert len(first_content) == 1
+    assert first_content[0]["type"] == "input_image"
+
+    # Second handoff: the new agent responds and hands off again.
+    # The input_history now contains the result from the first handoff.
+    second_data = HandoffInputData(
+        input_history=first_nested.input_history,
+        pre_handoff_items=(_get_message_output_run_item("Handing off again"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+    second_nested = nest_handoff_history(second_data)
+
+    # The second handoff should still only have 2 items, not 3.
+    # The preserved image from the first handoff should not be re-extracted.
+    assert len(second_nested.input_history) == 2
+
+    # Verify the image is still preserved (only once).
+    second_preserved = _as_message(second_nested.input_history[1])
+    assert second_preserved["role"] == "user"
+    second_content = second_preserved["content"]
+    assert isinstance(second_content, list)
+    assert len(second_content) == 1
+    assert second_content[0]["type"] == "input_image"
+    assert second_content[0]["image_url"] == image_url
+
+    # Third handoff: verify it still doesn't duplicate.
+    third_data = HandoffInputData(
+        input_history=second_nested.input_history,
+        pre_handoff_items=(_get_message_output_run_item("One more handoff"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+    third_nested = nest_handoff_history(third_data)
+
+    # Still only 2 items after three handoffs.
+    assert len(third_nested.input_history) == 2

From 89e8acb746821f23434f188ad59c61cf59a7e8f1 Mon Sep 17 00:00:00 2001
From: Saksham Singh Rathore
Date: Sat, 27 Dec 2025 11:27:18 +0530
Subject: [PATCH 3/3] fix tests

---
 tests/test_extension_filters.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_extension_filters.py b/tests/test_extension_filters.py
index 4b16c9ed9..2c6104a5a 100644
--- a/tests/test_extension_filters.py
+++ b/tests/test_extension_filters.py
@@ -632,6 +632,7 @@ def test_nest_handoff_history_no_duplicate_on_chained_handoffs() -> None:
 
     # Verify first handoff has 2 items: summary + preserved image.
     assert len(first_nested.input_history) == 2
+    assert not isinstance(first_nested.input_history, str)
     first_preserved = _as_message(first_nested.input_history[1])
     assert first_preserved["role"] == "user"
     first_content = first_preserved["content"]
@@ -652,6 +653,7 @@ def test_nest_handoff_history_no_duplicate_on_chained_handoffs() -> None:
     # The second handoff should still only have 2 items, not 3.
     # The preserved image from the first handoff should not be re-extracted.
     assert len(second_nested.input_history) == 2
+    assert not isinstance(second_nested.input_history, str)
 
     # Verify the image is still preserved (only once).
     second_preserved = _as_message(second_nested.input_history[1])
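
For reference, a minimal usage sketch of the behavior this series implements, condensed from the tests above. It is illustrative only: the import paths are assumptions inferred from the diff's file layout and have not been confirmed against the SDK's public API.

# Sketch of what nest_handoff_history produces after these patches.
# Assumed imports -- adjust to the SDK's actual exports.
from agents import HandoffInputData, RunContextWrapper  # assumed re-exports
from agents.handoffs.history import nest_handoff_history  # path taken from the diff

user_turn = {
    "role": "user",
    "content": [
        {"type": "input_text", "text": "What's in this image?"},
        {"type": "input_image", "image_url": "https://example.com/cat.jpg", "detail": "auto"},
    ],
}

data = HandoffInputData(
    input_history=(user_turn,),
    pre_handoff_items=(),
    new_items=(),
    run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)
# nested.input_history[0]: the assistant summary, whose text contains both
#   "What's in this image?" and "[1 image(s) attached]".
# nested.input_history[1]: a user message with name "__multimodal_preserved__"
#   carrying the original input_image part, so the target agent still sees it.

# Chaining (PATCH 2/3): feeding the result through nest_handoff_history again
# keeps the history at two items; the marker prevents re-extracting the image.
chained = nest_handoff_history(
    HandoffInputData(
        input_history=nested.input_history,
        pre_handoff_items=(),
        new_items=(),
        run_context=RunContextWrapper(context=()),
    )
)
assert len(chained.input_history) == 2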