109 changes: 107 additions & 2 deletions src/agents/handoffs/history.py
@@ -21,6 +21,10 @@
"set_conversation_history_wrappers",
]

# Content types that represent multimodal data (images, files, audio) which should be
# preserved during handoffs rather than being converted to text summaries.
_MULTIMODAL_CONTENT_TYPES = frozenset({"input_image", "input_file", "input_audio"})

_DEFAULT_CONVERSATION_HISTORY_START = "<CONVERSATION HISTORY>"
_DEFAULT_CONVERSATION_HISTORY_END = "</CONVERSATION HISTORY>"
_conversation_history_start = _DEFAULT_CONVERSATION_HISTORY_START
@@ -90,10 +94,28 @@ def nest_handoff_history(
def default_handoff_history_mapper(
transcript: list[TResponseInputItem],
) -> list[TResponseInputItem]:
"""Return a single assistant message summarizing the transcript."""
"""Return a summary of the transcript, preserving multimodal content.

The returned list contains:
1. An assistant message summarizing the text conversation
2. A user message with any multimodal content (images, files, audio) if present

This ensures that multimodal content uploaded by users is preserved during handoffs.
"""
multimodal_content = _extract_multimodal_content(transcript)
summary_message = _build_summary_message(transcript)
return [summary_message]

result: list[TResponseInputItem] = [summary_message]

# If there's multimodal content, add it as a user message so the next agent can see it.
if multimodal_content:
user_message: dict[str, Any] = {
"role": "user",
"content": multimodal_content,
}
result.append(cast(TResponseInputItem, user_message))

return result
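
# Illustrative sketch (not part of this diff): for a hypothetical transcript with one
# text-plus-image user turn, the mapper above would return two items. The exact summary
# text comes from _build_summary_message, which is defined elsewhere in this module, so
# the wording below is an assumption based on _stringify_content_list further down.
#
#   transcript = [
#       {"role": "user", "content": [
#           {"type": "input_text", "text": "What's in this photo?"},
#           {"type": "input_image", "image_url": "https://example.com/a.jpg"},
#       ]},
#   ]
#   default_handoff_history_mapper(transcript)
#   # -> [
#   #     {"role": "assistant", "content": "...What's in this photo? [1 image(s) attached]..."},
#   #     {"role": "user", "content": [{"type": "input_image", "image_url": "https://example.com/a.jpg"}]},
#   # ]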


def _normalize_input_history(
@@ -157,12 +179,61 @@ def _stringify_content(content: Any) -> str:
return ""
if isinstance(content, str):
return content
# Handle multimodal content (list of content parts).
if isinstance(content, list):
return _stringify_content_list(content)
try:
return json.dumps(content, ensure_ascii=False, default=str)
except TypeError:
return str(content)


def _stringify_content_list(content_list: list[Any]) -> str:
"""Convert a list of content parts to a human-readable string.

For multimodal content, this provides a summary that indicates the presence
of images, files, and audio without including their binary data.
"""
parts: list[str] = []
image_count = 0
file_count = 0
audio_count = 0

for part in content_list:
if isinstance(part, dict):
part_type = part.get("type")
if part_type == "input_text":
text = part.get("text", "")
if text:
parts.append(text)
elif part_type == "input_image":
image_count += 1
elif part_type == "input_file":
file_count += 1
elif part_type == "input_audio":
audio_count += 1
else:
# Unknown part type; fall back to JSON (or str) serialization.
try:
parts.append(json.dumps(part, ensure_ascii=False, default=str))
except TypeError:
parts.append(str(part))
elif isinstance(part, str):
parts.append(part)
else:
parts.append(str(part))

# Add indicators for multimodal content.
if image_count > 0:
parts.append(f"[{image_count} image(s) attached]")
if file_count > 0:
parts.append(f"[{file_count} file(s) attached]")
if audio_count > 0:
parts.append(f"[{audio_count} audio file(s) attached]")

return " ".join(parts)


def _flatten_nested_history_messages(
items: list[TResponseInputItem],
) -> list[TResponseInputItem]:
@@ -234,3 +305,37 @@ def _split_role_and_name(role_text: str) -> tuple[str, str | None]:
def _get_run_item_role(run_item: RunItem) -> str | None:
role_candidate = run_item.to_input_item().get("role")
return role_candidate if isinstance(role_candidate, str) else None


def _extract_multimodal_content(
transcript: list[TResponseInputItem],
) -> list[dict[str, Any]]:
"""Extract multimodal content (images, files, audio) from user messages in the transcript.

This function scans through all user messages and extracts any multimodal content parts
(input_image, input_file, input_audio) so they can be preserved during handoffs.

Returns:
A list of multimodal content items, or an empty list if none found.
"""
multimodal_parts: list[dict[str, Any]] = []

for item in transcript:
# Only extract multimodal content from user messages.
role = item.get("role")
if role != "user":
continue

content = item.get("content")
if content is None:
continue

# If content is a list, check each part for multimodal types.
if isinstance(content, list):
for part in content:
if isinstance(part, dict):
part_type = part.get("type")
if part_type in _MULTIMODAL_CONTENT_TYPES:
multimodal_parts.append(deepcopy(part))

return multimodal_parts
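
# Illustrative sketch (not part of this diff): only multimodal parts from user-role
# messages are extracted; assistant content and plain text parts are skipped. The
# image URL below is a placeholder.
#
#   _extract_multimodal_content([
#       {"role": "user", "content": [
#           {"type": "input_text", "text": "hi"},
#           {"type": "input_image", "image_url": "https://example.com/a.jpg"},
#       ]},
#       {"role": "assistant", "content": "hello"},
#   ])
#   # -> [{"type": "input_image", "image_url": "https://example.com/a.jpg"}]
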
213 changes: 213 additions & 0 deletions tests/test_extension_filters.py
@@ -398,3 +398,216 @@ def map_history(items: list[TResponseInputItem]) -> list[TResponseInputItem]:
)
assert second["role"] == "user"
assert second["content"] == "Hello"


def _get_user_input_item_with_image(text: str, image_url: str) -> TResponseInputItem:
"""Create a user input item with both text and an image."""
return {
"role": "user",
"content": [
{"type": "input_text", "text": text},
{"type": "input_image", "image_url": image_url, "detail": "auto"},
],
}


def _get_user_input_item_with_file(text: str, file_data: str) -> TResponseInputItem:
"""Create a user input item with both text and a file."""
return {
"role": "user",
"content": [
{"type": "input_text", "text": text},
{"type": "input_file", "file_data": file_data, "filename": "test.txt"},
],
}


def _get_user_input_item_image_only(image_url: str) -> TResponseInputItem:
"""Create a user input item with only an image (no text)."""
return {
"role": "user",
"content": [
{"type": "input_image", "image_url": image_url, "detail": "high"},
],
}


def test_nest_handoff_history_preserves_image_content() -> None:
"""Test that image content from user messages is preserved during handoff."""
image_url = "https://example.com/test-image.jpg"
data = HandoffInputData(
input_history=(_get_user_input_item_with_image("What's in this image?", image_url),),
pre_handoff_items=(_get_message_output_run_item("I see an image"),),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
# Should have 2 items: summary message + user message with image.
assert len(nested.input_history) == 2

# First item should be the summary.
summary = _as_message(nested.input_history[0])
assert summary["role"] == "assistant"
summary_content = summary["content"]
assert isinstance(summary_content, str)
assert "What's in this image?" in summary_content
assert "[1 image(s) attached]" in summary_content

# Second item should be the preserved image content.
image_msg = _as_message(nested.input_history[1])
assert image_msg["role"] == "user"
image_content = image_msg["content"]
assert isinstance(image_content, list)
assert len(image_content) == 1
assert image_content[0]["type"] == "input_image"
assert image_content[0]["image_url"] == image_url
assert image_content[0]["detail"] == "auto"


def test_nest_handoff_history_preserves_file_content() -> None:
"""Test that file content from user messages is preserved during handoff."""
file_data = "base64encodeddata"
data = HandoffInputData(
input_history=(_get_user_input_item_with_file("Analyze this file", file_data),),
pre_handoff_items=(_get_message_output_run_item("Analyzing file"),),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
assert len(nested.input_history) == 2

# First item should be the summary.
summary = _as_message(nested.input_history[0])
summary_content = summary["content"]
assert isinstance(summary_content, str)
assert "[1 file(s) attached]" in summary_content

# Second item should be the preserved file content.
file_msg = _as_message(nested.input_history[1])
assert file_msg["role"] == "user"
file_content = file_msg["content"]
assert isinstance(file_content, list)
assert len(file_content) == 1
assert file_content[0]["type"] == "input_file"
assert file_content[0]["file_data"] == file_data


def test_nest_handoff_history_preserves_multiple_images() -> None:
"""Test that multiple images from different user messages are preserved."""
image_url1 = "https://example.com/image1.jpg"
image_url2 = "https://example.com/image2.jpg"
data = HandoffInputData(
input_history=(
_get_user_input_item_image_only(image_url1),
_get_user_input_item_image_only(image_url2),
),
pre_handoff_items=(_get_message_output_run_item("Two images received"),),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
assert len(nested.input_history) == 2

# Second item should contain both images.
image_msg = _as_message(nested.input_history[1])
assert image_msg["role"] == "user"
image_content = image_msg["content"]
assert isinstance(image_content, list)
assert len(image_content) == 2
assert image_content[0]["type"] == "input_image"
assert image_content[0]["image_url"] == image_url1
assert image_content[1]["type"] == "input_image"
assert image_content[1]["image_url"] == image_url2


def test_nest_handoff_history_no_multimodal_single_message() -> None:
"""Test that text-only messages result in a single summary message."""
data = HandoffInputData(
input_history=(_get_user_input_item("Hello, how are you?"),),
pre_handoff_items=(_get_message_output_run_item("I am fine"),),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
# Should only have 1 item (no multimodal content).
assert len(nested.input_history) == 1
summary = _as_message(nested.input_history[0])
assert summary["role"] == "assistant"


def test_nest_handoff_history_ignores_multimodal_in_assistant_messages() -> None:
"""Test that multimodal content in non-user messages is not extracted.

Only user-uploaded content should be preserved, not content from assistant responses.
"""
# Construct an assistant message that (atypically) contains image content.
assistant_with_image: TResponseInputItem = { # type: ignore[misc,assignment]
"role": "assistant",
"content": [
{"type": "output_text", "text": "Here is the image"},
{"type": "input_image", "image_url": "https://example.com/generated.jpg"},
],
}
data = HandoffInputData(
input_history=(assistant_with_image,),
pre_handoff_items=(),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
# Should only have 1 item - no additional user message with multimodal content.
assert len(nested.input_history) == 1
summary = _as_message(nested.input_history[0])
assert summary["role"] == "assistant"


def test_nest_handoff_history_preserves_audio_content() -> None:
"""Test that audio content from user messages is preserved during handoff."""
audio_data = "base64audiocontent"
user_with_audio: TResponseInputItem = { # type: ignore[misc,assignment]
"role": "user",
"content": [
{"type": "input_text", "text": "Listen to this"},
{"type": "input_audio", "input_audio": {"data": audio_data, "format": "mp3"}},
],
}
data = HandoffInputData(
input_history=(user_with_audio,),
pre_handoff_items=(_get_message_output_run_item("Audio received"),),
new_items=(),
run_context=RunContextWrapper(context=()),
)

nested = nest_handoff_history(data)

assert isinstance(nested.input_history, tuple)
assert len(nested.input_history) == 2

# Check summary mentions audio.
summary = _as_message(nested.input_history[0])
summary_content = summary["content"]
assert isinstance(summary_content, str)
assert "[1 audio file(s) attached]" in summary_content

# Check audio is preserved.
audio_msg = _as_message(nested.input_history[1])
assert audio_msg["role"] == "user"
audio_content = audio_msg["content"]
assert isinstance(audio_content, list)
assert len(audio_content) == 1
assert audio_content[0]["type"] == "input_audio"