1 change: 0 additions & 1 deletion dev-tools/mcp-mock-server/server.py
@@ -25,7 +25,6 @@
from pathlib import Path
from typing import Any


# Global storage for captured headers (last request)
last_headers: dict[str, str] = {}
request_log: list = []
51 changes: 33 additions & 18 deletions src/app/endpoints/query_v2.py
@@ -83,21 +83,24 @@

def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-many-branches
output_item: OpenAIResponseOutput,
rag_chunks: list[RAGChunk],
) -> tuple[Optional[ToolCallSummary], Optional[ToolResultSummary]]:
"""Translate Responses API tool outputs into ToolCallSummary and ToolResultSummary records.

Processes OpenAI response output items and extracts tool call and result information.
Also parses RAG chunks from file_search_call items and appends them to the provided list.

Args:
output_item: An OpenAIResponseOutput item from the response.output array
rag_chunks: List to append extracted RAG chunks to (from file_search_call items)

Returns:
A tuple of (ToolCallSummary, ToolResultSummary), either of which may be None
if the current llama-stack Responses API does not provide the information.

Supported tool types:
- function_call: Function tool calls with parsed arguments (no result)
- file_search_call: File search operations with results
- file_search_call: File search operations with results (also extracts RAG chunks)
- web_search_call: Web search operations (incomplete)
- mcp_call: MCP calls with server labels
- mcp_list_tools: MCP server tool listings
@@ -120,6 +123,7 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-

if item_type == "file_search_call":
item = cast(OpenAIResponseOutputMessageFileSearchToolCall, output_item)
extract_rag_chunks_from_file_search_item(item, rag_chunks)
response_payload: Optional[dict[str, Any]] = None
if item.results is not None:
response_payload = {
@@ -431,12 +435,13 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
llm_response = ""
tool_calls: list[ToolCallSummary] = []
tool_results: list[ToolResultSummary] = []
rag_chunks: list[RAGChunk] = []
for output_item in response.output:
message_text = extract_text_from_response_output_item(output_item)
if message_text:
llm_response += message_text

tool_call, tool_result = _build_tool_call_summary(output_item)
tool_call, tool_result = _build_tool_call_summary(output_item, rag_chunks)
if tool_call:
tool_calls.append(tool_call)
if tool_result:
@@ -448,9 +453,6 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
len(llm_response),
)

# Extract rag chunks
rag_chunks = parse_rag_chunks_from_responses_api(response)

summary = TurnSummary(
llm_response=llm_response,
tool_calls=tool_calls,
@@ -479,7 +481,27 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
)


def parse_rag_chunks_from_responses_api(response_obj: Any) -> list[RAGChunk]:
def extract_rag_chunks_from_file_search_item(
item: OpenAIResponseOutputMessageFileSearchToolCall,
rag_chunks: list[RAGChunk],
) -> None:
"""Extract RAG chunks from a file search tool call item and append to rag_chunks.

Args:
item: The file search tool call item.
rag_chunks: List to append extracted RAG chunks to.
"""
if item.results is not None:
for result in item.results:
rag_chunk = RAGChunk(
content=result.text, source="file_search", score=result.score
)
rag_chunks.append(rag_chunk)


def parse_rag_chunks_from_responses_api(
response_obj: OpenAIResponseObject,
) -> list[RAGChunk]:
"""
Extract rag_chunks from the llama-stack OpenAI response.

@@ -489,20 +511,13 @@ def parse_rag_chunks_from_responses_api(response_obj: Any) -> list[RAGChunk]:
Returns:
List of RAGChunk with content, source, score
"""
rag_chunks = []
rag_chunks: list[RAGChunk] = []

for output_item in response_obj.output:
if (
hasattr(output_item, "type")
and output_item.type == "file_search_call"
and hasattr(output_item, "results")
):

for result in output_item.results:
rag_chunk = RAGChunk(
content=result.text, source="file_search", score=result.score
)
rag_chunks.append(rag_chunk)
item_type = getattr(output_item, "type", None)
if item_type == "file_search_call":
item = cast(OpenAIResponseOutputMessageFileSearchToolCall, output_item)
extract_rag_chunks_from_file_search_item(item, rag_chunks)

return rag_chunks

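For reference, a minimal usage sketch of the new extract_rag_chunks_from_file_search_item helper (not part of the diff). SimpleNamespace objects stand in for the real OpenAIResponseOutputMessageFileSearchToolCall and its result entries; the only attributes assumed (results, text, score) are the ones the helper reads above, and the import path for the helper is assumed from the file location.

# Illustrative sketch only: fakes built with SimpleNamespace instead of the
# real llama-stack response types.
from types import SimpleNamespace

from app.endpoints.query_v2 import extract_rag_chunks_from_file_search_item  # assumed module path
from utils.types import RAGChunk

fake_item = SimpleNamespace(
    type="file_search_call",
    results=[
        SimpleNamespace(text="first retrieved passage", score=0.91),
        SimpleNamespace(text="second retrieved passage", score=0.42),
    ],
)

rag_chunks: list[RAGChunk] = []
# Appends one RAGChunk(content=..., source="file_search", score=...) per result.
extract_rag_chunks_from_file_search_item(fake_item, rag_chunks)  # type: ignore[arg-type]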
11 changes: 8 additions & 3 deletions src/app/endpoints/streaming_query_v2.py
@@ -70,7 +70,7 @@
)
from utils.token_counter import TokenCounter
from utils.transcripts import store_transcript
from utils.types import TurnSummary
from utils.types import RAGChunk, TurnSummary

logger = logging.getLogger("app.endpoints.handlers")
router = APIRouter(tags=["streaming_query_v1"])
@@ -143,6 +143,9 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
# Track the latest response object from response.completed event
latest_response_object: Optional[Any] = None

# RAG chunks
rag_chunks: list[RAGChunk] = []

logger.debug("Starting streaming response (Responses API) processing")

async for chunk in turn_response:
@@ -198,7 +201,9 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
)
if done_chunk.item.type == "message":
continue
tool_call, tool_result = _build_tool_call_summary(done_chunk.item)
tool_call, tool_result = _build_tool_call_summary(
done_chunk.item, rag_chunks
)
if tool_call:
summary.tool_calls.append(tool_call)
yield stream_event(
@@ -321,7 +326,7 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
is_transcripts_enabled_func=is_transcripts_enabled,
store_transcript_func=store_transcript,
persist_user_conversation_details_func=persist_user_conversation_details,
rag_chunks=[], # Responses API uses empty list for rag_chunks
rag_chunks=[rag_chunk.model_dump() for rag_chunk in rag_chunks],
)

return response_generator
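As a sanity check on the serialization added above (again not part of the diff): each accumulated RAGChunk is a Pydantic model, so model_dump() yields the plain dict that cleanup_after_streaming receives; the field set shown is the one used by the constructor calls in this PR and may not be exhaustive.

from utils.types import RAGChunk

chunks = [RAGChunk(content="retrieved passage", source="file_search", score=0.91)]
payload = [chunk.model_dump() for chunk in chunks]
# payload is a list of plain dicts, e.g.
# {"content": "retrieved passage", "source": "file_search", "score": 0.91, ...}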
1 change: 0 additions & 1 deletion src/client.py
@@ -11,7 +11,6 @@
from models.config import LlamaStackConfiguration
from utils.types import Singleton


logger = logging.getLogger(__name__)


1 change: 0 additions & 1 deletion src/quota/quota_limiter.py
@@ -42,7 +42,6 @@
from quota.connect_pg import connect_pg
from quota.connect_sqlite import connect_sqlite


logger = get_logger(__name__)


2 changes: 1 addition & 1 deletion src/utils/endpoints.py
@@ -747,7 +747,7 @@ async def cleanup_after_streaming(
is_transcripts_enabled_func: Function to check if transcripts are enabled
store_transcript_func: Function to store transcript
persist_user_conversation_details_func: Function to persist conversation details
rag_chunks: Optional RAG chunks dict (for Agent API, None for Responses API)
rag_chunks: Optional RAG chunks dict
"""
# Store transcript if enabled
if not is_transcripts_enabled_func():
1 change: 0 additions & 1 deletion src/utils/mcp_headers.py
@@ -8,7 +8,6 @@

from configuration import AppConfig


logger = logging.getLogger("app.endpoints.dependencies")


1 change: 0 additions & 1 deletion tests/e2e/features/steps/llm_query_response.py
@@ -6,7 +6,6 @@
from behave.runner import Context
from tests.e2e.utils.utils import replace_placeholders


DEFAULT_LLM_TIMEOUT = 60


1 change: 0 additions & 1 deletion tests/unit/app/endpoints/test_a2a.py
@@ -39,7 +39,6 @@
from configuration import AppConfig
from models.config import Action


# User ID must be proper UUID
MOCK_AUTH = (
"00000001-0001-0001-0001-000000000001",
1 change: 0 additions & 1 deletion tests/unit/app/endpoints/test_streaming_query.py
@@ -43,7 +43,6 @@
from tests.unit.utils.auth_helpers import mock_authorization_resolvers
from utils.token_counter import TokenCounter


# Note: content_delta module doesn't exist in llama-stack-client 0.3.x
# These are mock classes for backward compatibility with Agent API tests
# pylint: disable=too-few-public-methods,redefined-builtin
1 change: 0 additions & 1 deletion tests/unit/cache/test_postgres_cache.py
@@ -18,7 +18,6 @@
from cache.cache_error import CacheError
from cache.postgres_cache import PostgresCache


USER_ID_1 = suid.get_suid()
USER_ID_2 = suid.get_suid()
CONVERSATION_ID_1 = suid.get_suid()
1 change: 0 additions & 1 deletion tests/unit/models/rlsapi/test_requests.py
@@ -15,7 +15,6 @@
RlsapiV1Terminal,
)


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
1 change: 0 additions & 1 deletion tests/unit/models/rlsapi/test_responses.py
@@ -12,7 +12,6 @@
)
from models.responses import AbstractSuccessfulResponse


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------