1 change: 0 additions & 1 deletion dev-tools/mcp-mock-server/server.py
@@ -25,7 +25,6 @@
from pathlib import Path
from typing import Any


# Global storage for captured headers (last request)
last_headers: dict[str, str] = {}
request_log: list = []
51 changes: 33 additions & 18 deletions src/app/endpoints/query_v2.py
@@ -83,21 +83,24 @@

def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-many-branches
output_item: OpenAIResponseOutput,
rag_chunks: list[RAGChunk],
) -> tuple[Optional[ToolCallSummary], Optional[ToolResultSummary]]:
"""Translate Responses API tool outputs into ToolCallSummary and ToolResultSummary records.

Processes OpenAI response output items and extracts tool call and result information.
Also parses RAG chunks from file_search_call items and appends them to the provided list.

Args:
output_item: An OpenAIResponseOutput item from the response.output array
rag_chunks: List to append extracted RAG chunks to (from file_search_call items)

Returns:
A tuple of (ToolCallSummary, ToolResultSummary), either of which may be None
if the current llama-stack Responses API does not provide the information.

Supported tool types:
- function_call: Function tool calls with parsed arguments (no result)
- file_search_call: File search operations with results
- file_search_call: File search operations with results (also extracts RAG chunks)
- web_search_call: Web search operations (incomplete)
- mcp_call: MCP calls with server labels
- mcp_list_tools: MCP server tool listings
@@ -120,6 +123,7 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-

if item_type == "file_search_call":
item = cast(OpenAIResponseOutputMessageFileSearchToolCall, output_item)
extract_rag_chunks_from_file_search_item(item, rag_chunks)
response_payload: Optional[dict[str, Any]] = None
if item.results is not None:
response_payload = {
@@ -431,12 +435,13 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
llm_response = ""
tool_calls: list[ToolCallSummary] = []
tool_results: list[ToolResultSummary] = []
rag_chunks: list[RAGChunk] = []
for output_item in response.output:
message_text = extract_text_from_response_output_item(output_item)
if message_text:
llm_response += message_text

tool_call, tool_result = _build_tool_call_summary(output_item)
tool_call, tool_result = _build_tool_call_summary(output_item, rag_chunks)
if tool_call:
tool_calls.append(tool_call)
if tool_result:
@@ -448,9 +453,6 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
len(llm_response),
)

# Extract rag chunks
rag_chunks = parse_rag_chunks_from_responses_api(response)

summary = TurnSummary(
llm_response=llm_response,
tool_calls=tool_calls,
@@ -479,7 +481,27 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
)


def parse_rag_chunks_from_responses_api(response_obj: Any) -> list[RAGChunk]:
def extract_rag_chunks_from_file_search_item(
item: OpenAIResponseOutputMessageFileSearchToolCall,
rag_chunks: list[RAGChunk],
) -> None:
"""Extract RAG chunks from a file search tool call item and append to rag_chunks.

Args:
item: The file search tool call item.
rag_chunks: List to append extracted RAG chunks to.
"""
if item.results is not None:
for result in item.results:
rag_chunk = RAGChunk(
content=result.text, source="file_search", score=result.score
)
rag_chunks.append(rag_chunk)


def parse_rag_chunks_from_responses_api(
response_obj: OpenAIResponseObject,
) -> list[RAGChunk]:
"""
Extract rag_chunks from the llama-stack OpenAI response.

@@ -489,20 +511,13 @@ def parse_rag_chunks_from_responses_api(response_obj: Any) -> list[RAGChunk]:
Returns:
List of RAGChunk with content, source, score
"""
rag_chunks = []
rag_chunks: list[RAGChunk] = []

for output_item in response_obj.output:
if (
hasattr(output_item, "type")
and output_item.type == "file_search_call"
and hasattr(output_item, "results")
):

for result in output_item.results:
rag_chunk = RAGChunk(
content=result.text, source="file_search", score=result.score
)
rag_chunks.append(rag_chunk)
item_type = getattr(output_item, "type", None)
if item_type == "file_search_call":
item = cast(OpenAIResponseOutputMessageFileSearchToolCall, output_item)
extract_rag_chunks_from_file_search_item(item, rag_chunks)

return rag_chunks

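For reference, a minimal usage sketch of the new extract_rag_chunks_from_file_search_item helper (not part of the diff). SimpleNamespace objects stand in for the real OpenAIResponseOutputMessageFileSearchToolCall and its result entries; the only attributes assumed (results, text, score) are the ones the helper reads above, and the import path for the helper is assumed from the file location.

# Illustrative sketch only: fakes built with SimpleNamespace instead of the
# real llama-stack response types.
from types import SimpleNamespace

from app.endpoints.query_v2 import extract_rag_chunks_from_file_search_item  # assumed module path
from utils.types import RAGChunk

fake_item = SimpleNamespace(
    type="file_search_call",
    results=[
        SimpleNamespace(text="first retrieved passage", score=0.91),
        SimpleNamespace(text="second retrieved passage", score=0.42),
    ],
)

rag_chunks: list[RAGChunk] = []
# Appends one RAGChunk(content=..., source="file_search", score=...) per result.
extract_rag_chunks_from_file_search_item(fake_item, rag_chunks)  # type: ignore[arg-type]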
11 changes: 8 additions & 3 deletions src/app/endpoints/streaming_query_v2.py
@@ -70,7 +70,7 @@
)
from utils.token_counter import TokenCounter
from utils.transcripts import store_transcript
from utils.types import TurnSummary
from utils.types import RAGChunk, TurnSummary

logger = logging.getLogger("app.endpoints.handlers")
router = APIRouter(tags=["streaming_query_v1"])
@@ -143,6 +143,9 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
# Track the latest response object from response.completed event
latest_response_object: Optional[Any] = None

# RAG chunks
rag_chunks: list[RAGChunk] = []

logger.debug("Starting streaming response (Responses API) processing")

async for chunk in turn_response:
@@ -198,7 +201,9 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
)
if done_chunk.item.type == "message":
continue
tool_call, tool_result = _build_tool_call_summary(done_chunk.item)
tool_call, tool_result = _build_tool_call_summary(
done_chunk.item, rag_chunks
)
if tool_call:
summary.tool_calls.append(tool_call)
yield stream_event(
@@ -321,7 +326,7 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
is_transcripts_enabled_func=is_transcripts_enabled,
store_transcript_func=store_transcript,
persist_user_conversation_details_func=persist_user_conversation_details,
rag_chunks=[], # Responses API uses empty list for rag_chunks
rag_chunks=[rag_chunk.model_dump() for rag_chunk in rag_chunks],
)

return response_generator
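As a sanity check on the serialization added above (again not part of the diff): each accumulated RAGChunk is a Pydantic model, so model_dump() yields the plain dict that cleanup_after_streaming receives; the field set shown is the one used by the constructor calls in this PR and may not be exhaustive.

from utils.types import RAGChunk

chunks = [RAGChunk(content="retrieved passage", source="file_search", score=0.91)]
payload = [chunk.model_dump() for chunk in chunks]
# payload is a list of plain dicts, e.g.
# {"content": "retrieved passage", "source": "file_search", "score": 0.91, ...}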
1 change: 0 additions & 1 deletion src/client.py
@@ -11,7 +11,6 @@
from models.config import LlamaStackConfiguration
from utils.types import Singleton


logger = logging.getLogger(__name__)


1 change: 0 additions & 1 deletion src/quota/quota_limiter.py
@@ -42,7 +42,6 @@
from quota.connect_pg import connect_pg
from quota.connect_sqlite import connect_sqlite


logger = get_logger(__name__)


2 changes: 1 addition & 1 deletion src/utils/endpoints.py
@@ -747,7 +747,7 @@ async def cleanup_after_streaming(
is_transcripts_enabled_func: Function to check if transcripts are enabled
store_transcript_func: Function to store transcript
persist_user_conversation_details_func: Function to persist conversation details
rag_chunks: Optional RAG chunks dict (for Agent API, None for Responses API)
rag_chunks: Optional RAG chunks dict
"""
# Store transcript if enabled
if not is_transcripts_enabled_func():
1 change: 0 additions & 1 deletion src/utils/mcp_headers.py
@@ -8,7 +8,6 @@

from configuration import AppConfig


logger = logging.getLogger("app.endpoints.dependencies")


1 change: 0 additions & 1 deletion tests/e2e/features/steps/llm_query_response.py
@@ -6,7 +6,6 @@
from behave.runner import Context
from tests.e2e.utils.utils import replace_placeholders


DEFAULT_LLM_TIMEOUT = 60


1 change: 0 additions & 1 deletion tests/unit/app/endpoints/test_a2a.py
@@ -39,7 +39,6 @@
from configuration import AppConfig
from models.config import Action


# User ID must be proper UUID
MOCK_AUTH = (
"00000001-0001-0001-0001-000000000001",
1 change: 0 additions & 1 deletion tests/unit/app/endpoints/test_streaming_query.py
@@ -43,7 +43,6 @@
from tests.unit.utils.auth_helpers import mock_authorization_resolvers
from utils.token_counter import TokenCounter


# Note: content_delta module doesn't exist in llama-stack-client 0.3.x
# These are mock classes for backward compatibility with Agent API tests
# pylint: disable=too-few-public-methods,redefined-builtin
1 change: 0 additions & 1 deletion tests/unit/cache/test_postgres_cache.py
@@ -18,7 +18,6 @@
from cache.cache_error import CacheError
from cache.postgres_cache import PostgresCache


USER_ID_1 = suid.get_suid()
USER_ID_2 = suid.get_suid()
CONVERSATION_ID_1 = suid.get_suid()
1 change: 0 additions & 1 deletion tests/unit/models/rlsapi/test_requests.py
@@ -15,7 +15,6 @@
RlsapiV1Terminal,
)


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
1 change: 0 additions & 1 deletion tests/unit/models/rlsapi/test_responses.py
@@ -12,7 +12,6 @@
)
from models.responses import AbstractSuccessfulResponse


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------