165 changes: 163 additions & 2 deletions src/agents/handoffs/history.py
@@ -21,6 +21,14 @@
"set_conversation_history_wrappers",
]

# Content types that represent multimodal data (images, files, audio) which should be
# preserved during handoffs rather than being converted to text summaries.
_MULTIMODAL_CONTENT_TYPES = frozenset({"input_image", "input_file", "input_audio"})

# Marker name used to identify user messages that contain preserved multimodal content
# from a previous handoff. This prevents re-extraction and duplication across chained handoffs.
_PRESERVED_MULTIMODAL_MARKER = "__multimodal_preserved__"

_DEFAULT_CONVERSATION_HISTORY_START = "<CONVERSATION HISTORY>"
_DEFAULT_CONVERSATION_HISTORY_END = "</CONVERSATION HISTORY>"
_conversation_history_start = _DEFAULT_CONVERSATION_HISTORY_START
@@ -90,10 +98,39 @@ def nest_handoff_history(
def default_handoff_history_mapper(
transcript: list[TResponseInputItem],
) -> list[TResponseInputItem]:
"""Return a single assistant message summarizing the transcript."""
"""Return a summary of the transcript, preserving multimodal content.

The returned list contains:
1. An assistant message summarizing the text conversation
2. A user message with any multimodal content (images, files, audio) if present

This ensures that multimodal content uploaded by users is preserved during handoffs.
Multimodal content is only extracted once and carried forward across chained handoffs.
"""
# Extract NEW multimodal content from user messages (excludes already-preserved content).
new_multimodal_content = _extract_multimodal_content(transcript)

# Also collect any already-preserved multimodal content from previous handoffs.
existing_multimodal_content = _collect_preserved_multimodal_content(transcript)

# Combine new and existing multimodal content.
all_multimodal_content = existing_multimodal_content + new_multimodal_content

summary_message = _build_summary_message(transcript)

result: list[TResponseInputItem] = [summary_message]

# If there's multimodal content, add it as a user message so the next agent can see it.
# Mark it with a special name to prevent re-extraction in subsequent handoffs.
if all_multimodal_content:
user_message: dict[str, Any] = {
"role": "user",
"name": _PRESERVED_MULTIMODAL_MARKER,
"content": all_multimodal_content,
}
result.append(cast(TResponseInputItem, user_message))

return result
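
A minimal sketch of the mapper's output shape on an illustrative transcript (the `image_url` field is an assumed, hypothetical content-part field, not taken from this diff):

    transcript = [
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": "What is in this picture?"},
                # Hypothetical image part; fields beyond "type" are assumptions.
                {"type": "input_image", "image_url": "https://example.com/cat.png"},
            ],
        },
        {"role": "assistant", "content": "It looks like a cat."},
    ]
    mapped = default_handoff_history_mapper(transcript)
    # mapped[0] is the assistant summary; mapped[1] carries the image forward,
    # tagged so later handoffs do not extract it again.
    assert mapped[1]["name"] == _PRESERVED_MULTIMODAL_MARKER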


def _normalize_input_history(
@@ -157,12 +194,61 @@ def _stringify_content(content: Any) -> str:
return ""
if isinstance(content, str):
return content
# Handle multimodal content (list of content parts).
if isinstance(content, list):
return _stringify_content_list(content)
try:
return json.dumps(content, ensure_ascii=False, default=str)
except TypeError:
return str(content)


def _stringify_content_list(content_list: list[Any]) -> str:
"""Convert a list of content parts to a human-readable string.

For multimodal content, this provides a summary that indicates the presence
of images, files, and audio without including their binary data.
"""
parts: list[str] = []
image_count = 0
file_count = 0
audio_count = 0

for part in content_list:
if isinstance(part, dict):
part_type = part.get("type")
if part_type == "input_text":
text = part.get("text", "")
if text:
parts.append(text)
elif part_type == "input_image":
image_count += 1
elif part_type == "input_file":
file_count += 1
elif part_type == "input_audio":
audio_count += 1
else:
# Unknown type, try to stringify it.
try:
parts.append(json.dumps(part, ensure_ascii=False, default=str))
except TypeError:
parts.append(str(part))
elif isinstance(part, str):
parts.append(part)
else:
parts.append(str(part))

# Add indicators for multimodal content.
if image_count > 0:
parts.append(f"[{image_count} image(s) attached]")
if file_count > 0:
parts.append(f"[{file_count} file(s) attached]")
if audio_count > 0:
parts.append(f"[{audio_count} audio file(s) attached]")

return " ".join(parts)


def _flatten_nested_history_messages(
items: list[TResponseInputItem],
) -> list[TResponseInputItem]:
@@ -234,3 +320,78 @@ def _split_role_and_name(role_text: str) -> tuple[str, str | None]:
def _get_run_item_role(run_item: RunItem) -> str | None:
role_candidate = run_item.to_input_item().get("role")
return role_candidate if isinstance(role_candidate, str) else None


def _extract_multimodal_content(
transcript: list[TResponseInputItem],
) -> list[dict[str, Any]]:
"""Extract multimodal content (images, files, audio) from user messages in the transcript.

This function scans through all user messages and extracts any multimodal content parts
(input_image, input_file, input_audio) so they can be preserved during handoffs.

Returns:
A list of multimodal content items, or an empty list if none found.
"""
multimodal_parts: list[dict[str, Any]] = []

for item in transcript:
# Only extract multimodal content from user messages.
role = item.get("role")
if role != "user":
continue

# Skip messages that are already preserved multimodal content from a previous handoff.
# This prevents duplication across chained handoffs.
name = item.get("name")
if name == _PRESERVED_MULTIMODAL_MARKER:
continue

content = item.get("content")
if content is None:
continue

# If content is a list, check each part for multimodal types.
if isinstance(content, list):
for part in content:
if isinstance(part, dict):
part_type = part.get("type")
if part_type in _MULTIMODAL_CONTENT_TYPES:
multimodal_parts.append(deepcopy(part))

return multimodal_parts
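
A sketch of the skip behavior on a hypothetical transcript: the plain user message is harvested, while the already-preserved message is left alone so chained handoffs do not duplicate it.

    transcript = [
        {"role": "user", "content": [{"type": "input_image", "image_url": "img-a"}]},
        {
            "role": "user",
            "name": _PRESERVED_MULTIMODAL_MARKER,  # carried over by an earlier handoff
            "content": [{"type": "input_image", "image_url": "img-a"}],
        },
    ]
    # Only the first message's image is extracted; the marked one is skipped.
    assert len(_extract_multimodal_content(transcript)) == 1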


def _collect_preserved_multimodal_content(
transcript: list[TResponseInputItem],
) -> list[dict[str, Any]]:
"""Collect multimodal content from messages already marked as preserved.

This function finds user messages marked with the preservation marker from previous
handoffs and collects their content to carry forward.

Returns:
A list of multimodal content items from preserved messages, or an empty list if none.
"""
preserved_parts: list[dict[str, Any]] = []

for item in transcript:
role = item.get("role")
if role != "user":
continue

name = item.get("name")
if name != _PRESERVED_MULTIMODAL_MARKER:
continue

content = item.get("content")
if content is None:
continue

# The preserved message content is a list of multimodal items.
if isinstance(content, list):
for part in content:
if isinstance(part, dict):
preserved_parts.append(deepcopy(part))

return preserved_parts
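
How the two helpers compose across chained handoffs, sketched on an assumed second-handoff transcript: `_extract_multimodal_content` ignores the marked message while this helper carries its parts forward, so the image survives exactly once in the next mapper output.

    second_transcript = [
        {"role": "assistant", "content": "Summary of the earlier conversation."},
        {
            "role": "user",
            "name": _PRESERVED_MULTIMODAL_MARKER,
            "content": [{"type": "input_image", "image_url": "https://example.com/cat.png"}],
        },
    ]
    assert _extract_multimodal_content(second_transcript) == []
    assert len(_collect_preserved_multimodal_content(second_transcript)) == 1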