
Commit 448cb60

Merge pull request #1018 from asimurka/tool_call_extraction_improvement
LCORE-1198: RAG chunk parsing improvement for streaming query
2 parents (4f168c3 + d7d11c7), commit 448cb60

File tree

3 files changed: +42 -22 lines


src/app/endpoints/query_v2.py

Lines changed: 33 additions & 18 deletions
@@ -83,21 +83,24 @@

 def _build_tool_call_summary(  # pylint: disable=too-many-return-statements,too-many-branches
     output_item: OpenAIResponseOutput,
+    rag_chunks: list[RAGChunk],
 ) -> tuple[Optional[ToolCallSummary], Optional[ToolResultSummary]]:
     """Translate Responses API tool outputs into ToolCallSummary and ToolResultSummary records.

     Processes OpenAI response output items and extracts tool call and result information.
+    Also parses RAG chunks from file_search_call items and appends them to the provided list.

     Args:
         output_item: An OpenAIResponseOutput item from the response.output array
+        rag_chunks: List to append extracted RAG chunks to (from file_search_call items)

     Returns:
         A tuple of (ToolCallSummary, ToolResultSummary) one of them possibly None
         if current llama stack Responses API does not provide the information.

     Supported tool types:
     - function_call: Function tool calls with parsed arguments (no result)
-    - file_search_call: File search operations with results
+    - file_search_call: File search operations with results (also extracts RAG chunks)
     - web_search_call: Web search operations (incomplete)
     - mcp_call: MCP calls with server labels
     - mcp_list_tools: MCP server tool listings
@@ -120,6 +123,7 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-

     if item_type == "file_search_call":
         item = cast(OpenAIResponseOutputMessageFileSearchToolCall, output_item)
+        extract_rag_chunks_from_file_search_item(item, rag_chunks)
         response_payload: Optional[dict[str, Any]] = None
         if item.results is not None:
             response_payload = {
@@ -430,12 +434,13 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
     llm_response = ""
     tool_calls: list[ToolCallSummary] = []
     tool_results: list[ToolResultSummary] = []
+    rag_chunks: list[RAGChunk] = []
     for output_item in response.output:
         message_text = extract_text_from_response_output_item(output_item)
         if message_text:
             llm_response += message_text

-        tool_call, tool_result = _build_tool_call_summary(output_item)
+        tool_call, tool_result = _build_tool_call_summary(output_item, rag_chunks)
         if tool_call:
             tool_calls.append(tool_call)
         if tool_result:
@@ -447,9 +452,6 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
         len(llm_response),
     )

-    # Extract rag chunks
-    rag_chunks = parse_rag_chunks_from_responses_api(response)
-
     summary = TurnSummary(
         llm_response=llm_response,
         tool_calls=tool_calls,
@@ -478,7 +480,27 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
     )


-def parse_rag_chunks_from_responses_api(response_obj: Any) -> list[RAGChunk]:
+def extract_rag_chunks_from_file_search_item(
+    item: OpenAIResponseOutputMessageFileSearchToolCall,
+    rag_chunks: list[RAGChunk],
+) -> None:
+    """Extract RAG chunks from a file search tool call item and append to rag_chunks.
+
+    Args:
+        item: The file search tool call item.
+        rag_chunks: List to append extracted RAG chunks to.
+    """
+    if item.results is not None:
+        for result in item.results:
+            rag_chunk = RAGChunk(
+                content=result.text, source="file_search", score=result.score
+            )
+            rag_chunks.append(rag_chunk)
+
+
+def parse_rag_chunks_from_responses_api(
+    response_obj: OpenAIResponseObject,
+) -> list[RAGChunk]:
     """
     Extract rag_chunks from the llama-stack OpenAI response.

@@ -488,20 +510,13 @@ def parse_rag_chunks_from_responses_api(response_obj: Any) -> list[RAGChunk]:
     Returns:
         List of RAGChunk with content, source, score
     """
-    rag_chunks = []
+    rag_chunks: list[RAGChunk] = []

     for output_item in response_obj.output:
-        if (
-            hasattr(output_item, "type")
-            and output_item.type == "file_search_call"
-            and hasattr(output_item, "results")
-        ):
-
-            for result in output_item.results:
-                rag_chunk = RAGChunk(
-                    content=result.text, source="file_search", score=result.score
-                )
-                rag_chunks.append(rag_chunk)
+        item_type = getattr(output_item, "type", None)
+        if item_type == "file_search_call":
+            item = cast(OpenAIResponseOutputMessageFileSearchToolCall, output_item)
+            extract_rag_chunks_from_file_search_item(item, rag_chunks)

     return rag_chunks
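To make the refactor concrete, here is a minimal, runnable sketch of the shared-accumulator pattern this commit introduces: the caller owns a rag_chunks list, and the helper appends to it in place during the same pass over response.output that builds the tool-call summaries. The RAGChunk, FileSearchResult, and FileSearchCallItem dataclasses below are hypothetical stand-ins for the real models (the actual RAGChunk is imported from utils.types); only the helper mirrors the function added in this diff.

# Hypothetical stand-in types; the real models come from llama-stack / utils.types.
from dataclasses import dataclass
from typing import Optional


@dataclass
class RAGChunk:
    content: str
    source: str
    score: Optional[float]


@dataclass
class FileSearchResult:
    text: str
    score: Optional[float]


@dataclass
class FileSearchCallItem:
    type: str
    results: Optional[list[FileSearchResult]]


def extract_rag_chunks_from_file_search_item(
    item: FileSearchCallItem, rag_chunks: list[RAGChunk]
) -> None:
    # Append one RAGChunk per file-search result to the caller's list.
    if item.results is not None:
        for result in item.results:
            rag_chunks.append(
                RAGChunk(content=result.text, source="file_search", score=result.score)
            )


# Single pass: the same loop that summarizes tool calls collects RAG chunks.
rag_chunks: list[RAGChunk] = []
output = [
    FileSearchCallItem(
        type="file_search_call",
        results=[FileSearchResult(text="retrieved chunk text", score=0.87)],
    )
]
for output_item in output:
    if getattr(output_item, "type", None) == "file_search_call":
        extract_rag_chunks_from_file_search_item(output_item, rag_chunks)

print(rag_chunks)  # one RAGChunk with source="file_search"

The in-place append is what lets the streaming path reuse the same helper: each response.output_item.done event can feed its item through _build_tool_call_summary without a second walk over the completed response.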

src/app/endpoints/streaming_query_v2.py

Lines changed: 8 additions & 3 deletions
@@ -70,7 +70,7 @@
 )
 from utils.token_counter import TokenCounter
 from utils.transcripts import store_transcript
-from utils.types import TurnSummary
+from utils.types import RAGChunk, TurnSummary

 logger = logging.getLogger("app.endpoints.handlers")
 router = APIRouter(tags=["streaming_query_v1"])
@@ -143,6 +143,9 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
     # Track the latest response object from response.completed event
     latest_response_object: Optional[Any] = None

+    # RAG chunks
+    rag_chunks: list[RAGChunk] = []
+
     logger.debug("Starting streaming response (Responses API) processing")

     async for chunk in turn_response:
@@ -198,7 +201,9 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
             )
             if done_chunk.item.type == "message":
                 continue
-            tool_call, tool_result = _build_tool_call_summary(done_chunk.item)
+            tool_call, tool_result = _build_tool_call_summary(
+                done_chunk.item, rag_chunks
+            )
             if tool_call:
                 summary.tool_calls.append(tool_call)
             yield stream_event(
@@ -321,7 +326,7 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
         is_transcripts_enabled_func=is_transcripts_enabled,
         store_transcript_func=store_transcript,
         persist_user_conversation_details_func=persist_user_conversation_details,
-        rag_chunks=[],  # Responses API uses empty list for rag_chunks
+        rag_chunks=[rag_chunk.model_dump() for rag_chunk in rag_chunks],
     )

     return response_generator
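A short sketch of the final serialization step, assuming RAGChunk is a pydantic v2 model, which the model_dump() call in the diff suggests (the class below is a stand-in, not the real utils.types.RAGChunk): chunks accumulated during streaming are dumped to plain dicts before being handed to cleanup_after_streaming, replacing the previously hard-coded empty list.

from typing import Optional

from pydantic import BaseModel


class RAGChunk(BaseModel):  # stand-in; assumes a pydantic v2 model
    content: str
    source: str
    score: Optional[float] = None


# Chunks collected from response.output_item.done events during the stream...
rag_chunks = [RAGChunk(content="retrieved chunk text", source="file_search", score=0.87)]

# ...are serialized to JSON-friendly dicts for transcript storage.
payload = [rag_chunk.model_dump() for rag_chunk in rag_chunks]
print(payload)  # [{'content': 'retrieved chunk text', 'source': 'file_search', 'score': 0.87}]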

src/utils/endpoints.py

Lines changed: 1 addition & 1 deletion
@@ -747,7 +747,7 @@ async def cleanup_after_streaming(
         is_transcripts_enabled_func: Function to check if transcripts are enabled
         store_transcript_func: Function to store transcript
         persist_user_conversation_details_func: Function to persist conversation details
-        rag_chunks: Optional RAG chunks dict (for Agent API, None for Responses API)
+        rag_chunks: Optional RAG chunks dict
     """
     # Store transcript if enabled
     if not is_transcripts_enabled_func():
