diff --git a/src/agents/handoffs/history.py b/src/agents/handoffs/history.py
index dc59547fb..0ed93495c 100644
--- a/src/agents/handoffs/history.py
+++ b/src/agents/handoffs/history.py
@@ -21,6 +21,14 @@
     "set_conversation_history_wrappers",
 ]

+# Content types that represent multimodal data (images, files, audio) which should be
+# preserved during handoffs rather than being converted to text summaries.
+_MULTIMODAL_CONTENT_TYPES = frozenset({"input_image", "input_file", "input_audio"})
+
+# Marker name used to identify user messages that contain preserved multimodal content
+# from a previous handoff. This prevents re-extraction and duplication across chained handoffs.
+_PRESERVED_MULTIMODAL_MARKER = "__multimodal_preserved__"
+
 _DEFAULT_CONVERSATION_HISTORY_START = ""
 _DEFAULT_CONVERSATION_HISTORY_END = ""
 _conversation_history_start = _DEFAULT_CONVERSATION_HISTORY_START
@@ -90,10 +98,39 @@ def nest_handoff_history(

 def default_handoff_history_mapper(
     transcript: list[TResponseInputItem],
 ) -> list[TResponseInputItem]:
-    """Return a single assistant message summarizing the transcript."""
+    """Return a summary of the transcript, preserving multimodal content.
+
+    The returned list contains:
+    1. An assistant message summarizing the text conversation
+    2. A user message with any multimodal content (images, files, audio) if present
+
+    This ensures that multimodal content uploaded by users is preserved during handoffs.
+    Multimodal content is only extracted once and carried forward across chained handoffs.
+    """
+    # Extract NEW multimodal content from user messages (excludes already-preserved content).
+    new_multimodal_content = _extract_multimodal_content(transcript)
+
+    # Also collect any already-preserved multimodal content from previous handoffs.
+    existing_multimodal_content = _collect_preserved_multimodal_content(transcript)
+
+    # Combine new and existing multimodal content.
+    all_multimodal_content = existing_multimodal_content + new_multimodal_content
     summary_message = _build_summary_message(transcript)
-    return [summary_message]
+
+    result: list[TResponseInputItem] = [summary_message]
+
+    # If there's multimodal content, add it as a user message so the next agent can see it.
+    # Mark it with a special name to prevent re-extraction in subsequent handoffs.
+    if all_multimodal_content:
+        user_message: dict[str, Any] = {
+            "role": "user",
+            "name": _PRESERVED_MULTIMODAL_MARKER,
+            "content": all_multimodal_content,
+        }
+        result.append(cast(TResponseInputItem, user_message))
+
+    return result


 def _normalize_input_history(
@@ -157,12 +194,61 @@ def _stringify_content(content: Any) -> str:
         return ""
     if isinstance(content, str):
         return content
+    # Handle multimodal content (list of content parts).
+    if isinstance(content, list):
+        return _stringify_content_list(content)
     try:
         return json.dumps(content, ensure_ascii=False, default=str)
     except TypeError:
         return str(content)


+def _stringify_content_list(content_list: list[Any]) -> str:
+    """Convert a list of content parts to a human-readable string.
+
+    For multimodal content, this provides a summary that indicates the presence
+    of images, files, and audio without including their binary data.
+    """
+    parts: list[str] = []
+    image_count = 0
+    file_count = 0
+    audio_count = 0
+
+    for part in content_list:
+        if isinstance(part, dict):
+            part_type = part.get("type")
+            if part_type == "input_text":
+                text = part.get("text", "")
+                if text:
+                    parts.append(text)
+            elif part_type == "input_image":
+                image_count += 1
+            elif part_type == "input_file":
+                file_count += 1
+            elif part_type == "input_audio":
+                audio_count += 1
+            else:
+                # Unknown type, try to stringify it.
+                try:
+                    parts.append(json.dumps(part, ensure_ascii=False, default=str))
+                except TypeError:
+                    parts.append(str(part))
+        elif isinstance(part, str):
+            parts.append(part)
+        else:
+            parts.append(str(part))
+
+    # Add indicators for multimodal content.
+    if image_count > 0:
+        parts.append(f"[{image_count} image(s) attached]")
+    if file_count > 0:
+        parts.append(f"[{file_count} file(s) attached]")
+    if audio_count > 0:
+        parts.append(f"[{audio_count} audio file(s) attached]")
+
+    return " ".join(parts)
+
+
 def _flatten_nested_history_messages(
     items: list[TResponseInputItem],
 ) -> list[TResponseInputItem]:
@@ -234,3 +320,78 @@ def _split_role_and_name(role_text: str) -> tuple[str, str | None]:
 def _get_run_item_role(run_item: RunItem) -> str | None:
     role_candidate = run_item.to_input_item().get("role")
     return role_candidate if isinstance(role_candidate, str) else None
+
+
+def _extract_multimodal_content(
+    transcript: list[TResponseInputItem],
+) -> list[dict[str, Any]]:
+    """Extract multimodal content (images, files, audio) from user messages in the transcript.
+
+    This function scans through all user messages and extracts any multimodal content parts
+    (input_image, input_file, input_audio) so they can be preserved during handoffs.
+
+    Returns:
+        A list of multimodal content items, or an empty list if none found.
+    """
+    multimodal_parts: list[dict[str, Any]] = []
+
+    for item in transcript:
+        # Only extract multimodal content from user messages.
+        role = item.get("role")
+        if role != "user":
+            continue
+
+        # Skip messages that are already preserved multimodal content from a previous handoff.
+        # This prevents duplication across chained handoffs.
+        name = item.get("name")
+        if name == _PRESERVED_MULTIMODAL_MARKER:
+            continue
+
+        content = item.get("content")
+        if content is None:
+            continue
+
+        # If content is a list, check each part for multimodal types.
+        if isinstance(content, list):
+            for part in content:
+                if isinstance(part, dict):
+                    part_type = part.get("type")
+                    if part_type in _MULTIMODAL_CONTENT_TYPES:
+                        multimodal_parts.append(deepcopy(part))
+
+    return multimodal_parts
+
+
+def _collect_preserved_multimodal_content(
+    transcript: list[TResponseInputItem],
+) -> list[dict[str, Any]]:
+    """Collect multimodal content from messages already marked as preserved.
+
+    This function finds user messages marked with the preservation marker from previous
+    handoffs and collects their content to carry forward.
+
+    Returns:
+        A list of multimodal content items from preserved messages, or an empty list if none.
+    """
+    preserved_parts: list[dict[str, Any]] = []
+
+    for item in transcript:
+        role = item.get("role")
+        if role != "user":
+            continue
+
+        name = item.get("name")
+        if name != _PRESERVED_MULTIMODAL_MARKER:
+            continue
+
+        content = item.get("content")
+        if content is None:
+            continue
+
+        # The preserved message content is a list of multimodal items.
+        if isinstance(content, list):
+            for part in content:
+                if isinstance(part, dict):
+                    preserved_parts.append(deepcopy(part))
+
+    return preserved_parts
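Reviewer note (not part of the patch): the mapper change above is easiest to see end to end. Below is a minimal sketch, assuming the module path matches the file location in this PR; the transcript literal is illustrative and mirrors the Responses-style input items used in the tests that follow, not documented API.

    from agents.handoffs.history import default_handoff_history_mapper

    # Hypothetical transcript: one user turn carrying text plus an image part.
    transcript = [
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": "What's in this image?"},
                {"type": "input_image", "image_url": "https://example.com/cat.jpg", "detail": "auto"},
            ],
        },
        {"role": "assistant", "content": "Let me hand this off."},
    ]

    mapped = default_handoff_history_mapper(transcript)
    # mapped[0] is the assistant summary; its text includes the user's question
    # and the "[1 image(s) attached]" indicator.
    # mapped[1] is {"role": "user", "name": "__multimodal_preserved__",
    #               "content": [a deep copy of the input_image part]}.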
diff --git a/tests/test_extension_filters.py b/tests/test_extension_filters.py
index 86161bbb7..2c6104a5a 100644
--- a/tests/test_extension_filters.py
+++ b/tests/test_extension_filters.py
@@ -398,3 +398,280 @@ def map_history(items: list[TResponseInputItem]) -> list[TResponseInputItem]:
     )
     assert second["role"] == "user"
     assert second["content"] == "Hello"
+
+
+def _get_user_input_item_with_image(text: str, image_url: str) -> TResponseInputItem:
+    """Create a user input item with both text and an image."""
+    return {
+        "role": "user",
+        "content": [
+            {"type": "input_text", "text": text},
+            {"type": "input_image", "image_url": image_url, "detail": "auto"},
+        ],
+    }
+
+
+def _get_user_input_item_with_file(text: str, file_data: str) -> TResponseInputItem:
+    """Create a user input item with both text and a file."""
+    return {
+        "role": "user",
+        "content": [
+            {"type": "input_text", "text": text},
+            {"type": "input_file", "file_data": file_data, "filename": "test.txt"},
+        ],
+    }
+
+
+def _get_user_input_item_image_only(image_url: str) -> TResponseInputItem:
+    """Create a user input item with only an image (no text)."""
+    return {
+        "role": "user",
+        "content": [
+            {"type": "input_image", "image_url": image_url, "detail": "high"},
+        ],
+    }
+
+
+def test_nest_handoff_history_preserves_image_content() -> None:
+    """Test that image content from user messages is preserved during handoff."""
+    image_url = "https://example.com/test-image.jpg"
+    data = HandoffInputData(
+        input_history=(_get_user_input_item_with_image("What's in this image?", image_url),),
+        pre_handoff_items=(_get_message_output_run_item("I see an image"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    # Should have 2 items: summary message + user message with image.
+    assert len(nested.input_history) == 2
+
+    # First item should be the summary.
+    summary = _as_message(nested.input_history[0])
+    assert summary["role"] == "assistant"
+    summary_content = summary["content"]
+    assert isinstance(summary_content, str)
+    assert "What's in this image?" in summary_content
+    assert "[1 image(s) attached]" in summary_content
+
+    # Second item should be the preserved image content.
+    image_msg = _as_message(nested.input_history[1])
+    assert image_msg["role"] == "user"
+    image_content = image_msg["content"]
+    assert isinstance(image_content, list)
+    assert len(image_content) == 1
+    assert image_content[0]["type"] == "input_image"
+    assert image_content[0]["image_url"] == image_url
+    assert image_content[0]["detail"] == "auto"
+
+
+def test_nest_handoff_history_preserves_file_content() -> None:
+    """Test that file content from user messages is preserved during handoff."""
+    file_data = "base64encodeddata"
+    data = HandoffInputData(
+        input_history=(_get_user_input_item_with_file("Analyze this file", file_data),),
+        pre_handoff_items=(_get_message_output_run_item("Analyzing file"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    assert len(nested.input_history) == 2
+
+    # First item should be the summary.
+    summary = _as_message(nested.input_history[0])
+    summary_content = summary["content"]
+    assert isinstance(summary_content, str)
+    assert "[1 file(s) attached]" in summary_content
+
+    # Second item should be the preserved file content.
+    file_msg = _as_message(nested.input_history[1])
+    assert file_msg["role"] == "user"
+    file_content = file_msg["content"]
+    assert isinstance(file_content, list)
+    assert len(file_content) == 1
+    assert file_content[0]["type"] == "input_file"
+    assert file_content[0]["file_data"] == file_data
+
+
+def test_nest_handoff_history_preserves_multiple_images() -> None:
+    """Test that multiple images from different user messages are preserved."""
+    image_url1 = "https://example.com/image1.jpg"
+    image_url2 = "https://example.com/image2.jpg"
+    data = HandoffInputData(
+        input_history=(
+            _get_user_input_item_image_only(image_url1),
+            _get_user_input_item_image_only(image_url2),
+        ),
+        pre_handoff_items=(_get_message_output_run_item("Two images received"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    assert len(nested.input_history) == 2
+
+    # Second item should contain both images.
+    image_msg = _as_message(nested.input_history[1])
+    assert image_msg["role"] == "user"
+    image_content = image_msg["content"]
+    assert isinstance(image_content, list)
+    assert len(image_content) == 2
+    assert image_content[0]["type"] == "input_image"
+    assert image_content[0]["image_url"] == image_url1
+    assert image_content[1]["type"] == "input_image"
+    assert image_content[1]["image_url"] == image_url2
+
+
+def test_nest_handoff_history_no_multimodal_single_message() -> None:
+    """Test that text-only messages result in a single summary message."""
+    data = HandoffInputData(
+        input_history=(_get_user_input_item("Hello, how are you?"),),
+        pre_handoff_items=(_get_message_output_run_item("I am fine"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    # Should only have 1 item (no multimodal content).
+    assert len(nested.input_history) == 1
+    summary = _as_message(nested.input_history[0])
+    assert summary["role"] == "assistant"
+
+
+def test_nest_handoff_history_ignores_multimodal_in_assistant_messages() -> None:
+    """Test that multimodal content in non-user messages is not extracted.
+
+    Only user-uploaded content should be preserved, not content from assistant responses.
+    """
+    # Create an assistant message that somehow has image content.
+    assistant_with_image: TResponseInputItem = {  # type: ignore[misc,assignment]
+        "role": "assistant",
+        "content": [
+            {"type": "output_text", "text": "Here is the image"},
+            {"type": "input_image", "image_url": "https://example.com/generated.jpg"},
+        ],
+    }
+    data = HandoffInputData(
+        input_history=(assistant_with_image,),
+        pre_handoff_items=(),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    # Should only have 1 item - no additional user message with multimodal content.
+    assert len(nested.input_history) == 1
+    summary = _as_message(nested.input_history[0])
+    assert summary["role"] == "assistant"
+
+
+def test_nest_handoff_history_preserves_audio_content() -> None:
+    """Test that audio content from user messages is preserved during handoff."""
+    audio_data = "base64audiocontent"
+    user_with_audio: TResponseInputItem = {  # type: ignore[misc,assignment]
+        "role": "user",
+        "content": [
+            {"type": "input_text", "text": "Listen to this"},
+            {"type": "input_audio", "input_audio": {"data": audio_data, "format": "mp3"}},
+        ],
+    }
+    data = HandoffInputData(
+        input_history=(user_with_audio,),
+        pre_handoff_items=(_get_message_output_run_item("Audio received"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+
+    nested = nest_handoff_history(data)
+
+    assert isinstance(nested.input_history, tuple)
+    assert len(nested.input_history) == 2
+
+    # Check summary mentions audio.
+    summary = _as_message(nested.input_history[0])
+    summary_content = summary["content"]
+    assert isinstance(summary_content, str)
+    assert "[1 audio file(s) attached]" in summary_content
+
+    # Check audio is preserved.
+    audio_msg = _as_message(nested.input_history[1])
+    assert audio_msg["role"] == "user"
+    audio_content = audio_msg["content"]
+    assert isinstance(audio_content, list)
+    assert len(audio_content) == 1
+    assert audio_content[0]["type"] == "input_audio"
+
+
+def test_nest_handoff_history_no_duplicate_on_chained_handoffs() -> None:
+    """Test that multimodal content is not duplicated across chained handoffs.
+
+    When an agent hands off to another agent, and that agent hands off again,
+    the multimodal content should only appear once, not be re-extracted and duplicated.
+    """
+    image_url = "https://example.com/test-image.jpg"
+
+    # First handoff: user sends image, agent responds and hands off.
+    first_data = HandoffInputData(
+        input_history=(_get_user_input_item_with_image("What's in this image?", image_url),),
+        pre_handoff_items=(_get_message_output_run_item("Let me hand this off"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+    first_nested = nest_handoff_history(first_data)
+
+    # Verify first handoff has 2 items: summary + preserved image.
+    assert len(first_nested.input_history) == 2
+    assert not isinstance(first_nested.input_history, str)
+    first_preserved = _as_message(first_nested.input_history[1])
+    assert first_preserved["role"] == "user"
+    first_content = first_preserved["content"]
+    assert isinstance(first_content, list)
+    assert len(first_content) == 1
+    assert first_content[0]["type"] == "input_image"
+
+    # Second handoff: the new agent responds and hands off again.
+    # The input_history now contains the result from the first handoff.
+    second_data = HandoffInputData(
+        input_history=first_nested.input_history,
+        pre_handoff_items=(_get_message_output_run_item("Handing off again"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+    second_nested = nest_handoff_history(second_data)
+
+    # The second handoff should still only have 2 items, not 3.
+    # The preserved image from the first handoff should not be re-extracted.
+    assert len(second_nested.input_history) == 2
+    assert not isinstance(second_nested.input_history, str)
+
+    # Verify the image is still preserved (only once).
+    second_preserved = _as_message(second_nested.input_history[1])
+    assert second_preserved["role"] == "user"
+    second_content = second_preserved["content"]
+    assert isinstance(second_content, list)
+    assert len(second_content) == 1
+    assert second_content[0]["type"] == "input_image"
+    assert second_content[0]["image_url"] == image_url
+
+    # Third handoff: verify it still doesn't duplicate.
+    third_data = HandoffInputData(
+        input_history=second_nested.input_history,
+        pre_handoff_items=(_get_message_output_run_item("One more handoff"),),
+        new_items=(),
+        run_context=RunContextWrapper(context=()),
+    )
+    third_nested = nest_handoff_history(third_data)
+
+    # Still only 2 items after three handoffs.
+    assert len(third_nested.input_history) == 2
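Reviewer note (not part of the patch): the chained-handoff guarantee exercised by the last test can also be checked in isolation. A minimal sketch, assuming HandoffInputData and RunContextWrapper are importable from the top-level agents package and nest_handoff_history from the module patched above:

    from agents import HandoffInputData, RunContextWrapper
    from agents.handoffs.history import nest_handoff_history

    image_turn = {
        "role": "user",
        "content": [
            {"type": "input_image", "image_url": "https://example.com/a.jpg", "detail": "auto"},
        ],
    }
    first = nest_handoff_history(
        HandoffInputData(
            input_history=(image_turn,),
            pre_handoff_items=(),
            new_items=(),
            run_context=RunContextWrapper(context=()),
        )
    )
    second = nest_handoff_history(
        HandoffInputData(
            input_history=first.input_history,
            pre_handoff_items=(),
            new_items=(),
            run_context=RunContextWrapper(context=()),
        )
    )

    # The "__multimodal_preserved__" marker stops re-extraction, so both results
    # hold exactly two items: the summary plus a single preserved image message.
    assert len(first.input_history) == 2
    assert len(second.input_history) == 2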