diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 98270f7b86..b93f6a380a 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,256 +40,229 @@ } ], "paths": { - "/v1/datasetio/rows": { - "get": { + "/v1/eval/tasks/{task_id}/evaluations": { + "post": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/PaginatedRowsResult" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "DatasetIO" + "Eval" ], "description": "", "parameters": [ { - "name": "dataset_id", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "rows_in_page", - "in": "query", + "name": "task_id", + "in": "path", "required": true, - "schema": { - "type": "integer" - } - }, - { - "name": "page_token", - "in": "query", - "required": false, - "schema": { - "type": "string" - } - }, - { - "name": "filter_condition", - "in": "query", - "required": false, "schema": { "type": "string" } } - ] - }, - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "DatasetIO" ], - "description": "", - "parameters": [], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/AppendRowsRequest" + "$ref": "#/components/schemas/DeprecatedEvaluateRowsRequest" } } }, "required": true - } + }, + "deprecated": true } }, - "/v1/batch-inference/chat-completion": { - "post": { + "/v1/eval-tasks/{task_id}": { + "get": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/BatchChatCompletionResponse" + "oneOf": [ + { + "$ref": "#/components/schemas/Benchmark" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "BatchInference (Coming Soon)" + "Benchmarks" ], "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": 
"#/components/schemas/BatchChatCompletionRequest" - } + "parameters": [ + { + "name": "eval_task_id", + "in": "query", + "required": true, + "schema": { + "type": "string" } - }, - "required": true - } + } + ], + "deprecated": true } }, - "/v1/batch-inference/completion": { - "post": { + "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "get": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/BatchCompletionResponse" + "oneOf": [ + { + "$ref": "#/components/schemas/JobStatus" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "BatchInference (Coming Soon)" + "Eval" ], "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/BatchCompletionRequest" - } + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" } }, - "required": true - } - } - }, - "/v1/post-training/job/cancel": { - "post": { + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + }, + "delete": { "responses": { "200": { "description": "OK" } }, "tags": [ - "PostTraining (Coming Soon)" + "Eval" ], "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CancelTrainingJobRequest" - } + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" } }, - "required": true - } + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true } }, - "/v1/inference/chat-completion": { - "post": { + "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "get": { "responses": { "200": { - "description": "If stream=False, returns a ChatCompletionResponse with the full completion. 
If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk", + "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ChatCompletionResponse" - } - }, - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/ChatCompletionResponseStreamChunk" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "Inference" + "Eval" ], - "description": "Generate a chat completion for the given messages using the specified model.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ChatCompletionRequest" - } + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" } }, - "required": true - } + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true } }, - "/v1/inference/completion": { - "post": { + "/v1/eval-tasks": { + "get": { "responses": { "200": { - "description": "If stream=False, returns a CompletionResponse with the full completion. 
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk", + "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/CompletionResponse" - } - }, - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/CompletionResponseStreamChunk" + "$ref": "#/components/schemas/ListBenchmarksResponse" } } } } }, "tags": [ - "Inference" + "Benchmarks" ], - "description": "Generate a completion for the given content using the specified model.", + "description": "", "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CompletionRequest" - } - } - }, - "required": true - } - } - }, - "/v1/agents": { + "deprecated": true + }, "post": { "responses": { "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AgentCreateResponse" - } - } - } + "description": "OK" } }, "tags": [ - "Agents" + "Benchmarks" ], "description": "", "parameters": [], @@ -297,15 +270,16 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/CreateAgentRequest" + "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest" } } }, "required": true - } + }, + "deprecated": true } }, - "/v1/agents/{agent_id}/session": { + "/v1/eval/tasks/{task_id}/jobs": { "post": { "responses": { "200": { @@ -313,19 +287,19 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/AgentSessionCreateResponse" + "$ref": "#/components/schemas/Job" } } } } }, "tags": [ - "Agents" + "Eval" ], "description": "", "parameters": [ { - "name": "agent_id", + "name": "task_id", "in": "path", "required": true, "schema": { @@ -337,60 +311,84 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/CreateAgentSessionRequest" + "$ref": "#/components/schemas/DeprecatedRunEvalRequest" } } }, "required": true - } + }, + "deprecated": true } }, - 
"/v1/agents/{agent_id}/session/{session_id}/turn": { - "post": { + "/v1/datasetio/rows": { + "get": { "responses": { "200": { - "description": "A single turn in an interaction with an Agentic System. **OR** streamed agent turn completion response.", + "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Turn" - } - }, - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/AgentTurnResponseStreamChunk" + "$ref": "#/components/schemas/PaginatedRowsResult" } } } } }, "tags": [ - "Agents" + "DatasetIO" ], "description": "", "parameters": [ { - "name": "agent_id", - "in": "path", + "name": "dataset_id", + "in": "query", "required": true, "schema": { "type": "string" } }, { - "name": "session_id", - "in": "path", + "name": "rows_in_page", + "in": "query", "required": true, + "schema": { + "type": "integer" + } + }, + { + "name": "page_token", + "in": "query", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "filter_condition", + "in": "query", + "required": false, "schema": { "type": "string" } } + ] + }, + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "DatasetIO" ], + "description": "", + "parameters": [], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/CreateAgentTurnRequest" + "$ref": "#/components/schemas/AppendRowsRequest" } } }, @@ -398,116 +396,106 @@ } } }, - "/v1/agents/{agent_id}": { - "delete": { + "/v1/batch-inference/chat-completion": { + "post": { "responses": { "200": { - "description": "OK" + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BatchChatCompletionResponse" + } + } + } } }, "tags": [ - "Agents" + "BatchInference (Coming Soon)" ], "description": "", - "parameters": [ - { - "name": "agent_id", - "in": "path", - "required": true, - "schema": { - "type": "string" + "parameters": [], + "requestBody": { + "content": { + 
"application/json": { + "schema": { + "$ref": "#/components/schemas/BatchChatCompletionRequest" + } } - } - ] + }, + "required": true + } } }, - "/v1/agents/{agent_id}/session/{session_id}": { - "get": { + "/v1/batch-inference/completion": { + "post": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Session" + "$ref": "#/components/schemas/BatchCompletionResponse" } } } } }, "tags": [ - "Agents" + "BatchInference (Coming Soon)" ], "description": "", - "parameters": [ - { - "name": "session_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "agent_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "turn_ids", - "in": "query", - "required": false, - "schema": { - "type": "array", - "items": { - "type": "string" + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BatchCompletionRequest" } } - } - ] - }, - "delete": { + }, + "required": true + } + } + }, + "/v1/post-training/job/cancel": { + "post": { "responses": { "200": { "description": "OK" } }, "tags": [ - "Agents" + "PostTraining (Coming Soon)" ], "description": "", - "parameters": [ - { - "name": "session_id", - "in": "path", - "required": true, - "schema": { - "type": "string" + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CancelTrainingJobRequest" + } } }, - { - "name": "agent_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] + "required": true + } } }, - "/v1/inference/embeddings": { + "/v1/inference/chat-completion": { "post": { "responses": { "200": { - "description": "An array of embeddings, one for each content. Each embedding is a list of floats. 
The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}", + "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EmbeddingsResponse" + "$ref": "#/components/schemas/ChatCompletionResponse" + } + }, + "text/event-stream": { + "schema": { + "$ref": "#/components/schemas/ChatCompletionResponseStreamChunk" } } } @@ -516,13 +504,13 @@ "tags": [ "Inference" ], - "description": "Generate embeddings for content pieces using the specified model.", + "description": "Generate a chat completion for the given messages using the specified model.", "parameters": [], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EmbeddingsRequest" + "$ref": "#/components/schemas/ChatCompletionRequest" } } }, @@ -530,39 +518,35 @@ } } }, - "/v1/eval/tasks/{task_id}/evaluations": { + "/v1/inference/completion": { "post": { "responses": { "200": { - "description": "OK", + "description": "If stream=False, returns a CompletionResponse with the full completion. 
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateResponse" + "$ref": "#/components/schemas/CompletionResponse" + } + }, + "text/event-stream": { + "schema": { + "$ref": "#/components/schemas/CompletionResponseStreamChunk" } } } } }, "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } + "Inference" ], + "description": "Generate a completion for the given content using the specified model.", + "parameters": [], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateRowsRequest" + "$ref": "#/components/schemas/CompletionRequest" } } }, @@ -570,15 +554,15 @@ } } }, - "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": { - "get": { + "/v1/agents": { + "post": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/AgentStepResponse" + "$ref": "#/components/schemas/AgentCreateResponse" } } } @@ -588,51 +572,28 @@ "Agents" ], "description": "", - "parameters": [ - { - "name": "agent_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "session_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "turn_id", - "in": "path", - "required": true, - "schema": { - "type": "string" + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateAgentRequest" + } } }, - { - "name": "step_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] + "required": true + } } }, - "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}": { - "get": { + "/v1/agents/{agent_id}/session": { + "post": { "responses": { "200": { "description": 
"OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Turn" + "$ref": "#/components/schemas/AgentSessionCreateResponse" } } } @@ -650,112 +611,87 @@ "schema": { "type": "string" } - }, - { - "name": "session_id", - "in": "path", - "required": true, - "schema": { - "type": "string" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateAgentSessionRequest" + } } }, - { - "name": "turn_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] + "required": true + } } }, - "/v1/datasets/{dataset_id}": { - "get": { + "/v1/agents/{agent_id}/session/{session_id}/turn": { + "post": { "responses": { "200": { - "description": "OK", + "description": "A single turn in an interaction with an Agentic System. **OR** streamed agent turn completion response.", "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Dataset" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/Turn" + } + }, + "text/event-stream": { + "schema": { + "$ref": "#/components/schemas/AgentTurnResponseStreamChunk" } } } } }, "tags": [ - "Datasets" + "Agents" ], "description": "", "parameters": [ { - "name": "dataset_id", + "name": "agent_id", "in": "path", "required": true, "schema": { "type": "string" } - } - ] - }, - "delete": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Datasets" - ], - "description": "", - "parameters": [ + }, { - "name": "dataset_id", + "name": "session_id", "in": "path", "required": true, "schema": { "type": "string" } } - ] + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateAgentTurnRequest" + } + } + }, + "required": true + } } }, - "/v1/eval-tasks/{eval_task_id}": { - "get": { + "/v1/agents/{agent_id}": { + "delete": { "responses": { "200": { - "description": "OK", - "content": { - "application/json": { - 
"schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/EvalTask" - }, - { - "type": "null" - } - ] - } - } - } + "description": "OK" } }, "tags": [ - "EvalTasks" + "Agents" ], "description": "", "parameters": [ { - "name": "eval_task_id", + "name": "agent_id", "in": "path", "required": true, "schema": { @@ -765,7 +701,7 @@ ] } }, - "/v1/models/{model_id}": { + "/v1/agents/{agent_id}/session/{session_id}": { "get": { "responses": { "200": { @@ -773,31 +709,43 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Model" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/Session" } } } } }, "tags": [ - "Models" + "Agents" ], "description": "", "parameters": [ { - "name": "model_id", + "name": "session_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "agent_id", "in": "path", "required": true, "schema": { "type": "string" } + }, + { + "name": "turn_ids", + "in": "query", + "required": false, + "schema": { + "type": "array", + "items": { + "type": "string" + } + } } ] }, @@ -808,12 +756,20 @@ } }, "tags": [ - "Models" + "Agents" ], "description": "", "parameters": [ { - "name": "model_id", + "name": "session_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "agent_id", "in": "path", "required": true, "schema": { @@ -823,81 +779,78 @@ ] } }, - "/v1/scoring-functions/{scoring_fn_id}": { - "get": { + "/v1/inference/embeddings": { + "post": { "responses": { "200": { - "description": "OK", + "description": "An array of embeddings, one for each content. Each embedding is a list of floats. 
The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}", "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFn" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/EmbeddingsResponse" } } } } }, "tags": [ - "ScoringFunctions" + "Inference" ], - "description": "", - "parameters": [ - { - "name": "scoring_fn_id", - "in": "path", - "required": true, - "schema": { - "type": "string" + "description": "Generate embeddings for content pieces using the specified model.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EmbeddingsRequest" + } } - } - ] + }, + "required": true + } } }, - "/v1/shields/{identifier}": { - "get": { + "/v1/eval/benchmarks/{benchmark_id}/evaluations": { + "post": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Shield" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "Shields" + "Eval" ], "description": "", "parameters": [ { - "name": "identifier", + "name": "benchmark_id", "in": "path", "required": true, "schema": { "type": "string" } } - ] + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateRowsRequest" + } + } + }, + "required": true + } } }, - "/v1/telemetry/traces/{trace_id}/spans/{span_id}": { + "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": { "get": { "responses": { "200": { @@ -905,19 +858,19 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Span" + "$ref": "#/components/schemas/AgentStepResponse" } } } } }, "tags": [ - "Telemetry" + "Agents" ], "description": "", "parameters": [ { - "name": "trace_id", + "name": "agent_id", "in": "path", "required": true, "schema": { @@ 
-925,7 +878,23 @@ } }, { - "name": "span_id", + "name": "session_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "turn_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "step_id", "in": "path", "required": true, "schema": { @@ -935,7 +904,7 @@ ] } }, - "/v1/telemetry/spans/{span_id}/tree": { + "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}": { "get": { "responses": { "200": { @@ -943,19 +912,19 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/QuerySpanTreeResponse" + "$ref": "#/components/schemas/Turn" } } } } }, "tags": [ - "Telemetry" + "Agents" ], "description": "", "parameters": [ { - "name": "span_id", + "name": "agent_id", "in": "path", "required": true, "schema": { @@ -963,28 +932,25 @@ } }, { - "name": "attributes_to_return", - "in": "query", - "required": false, + "name": "session_id", + "in": "path", + "required": true, "schema": { - "type": "array", - "items": { - "type": "string" - } + "type": "string" } }, { - "name": "max_depth", - "in": "query", - "required": false, + "name": "turn_id", + "in": "path", + "required": true, "schema": { - "type": "integer" + "type": "string" } } ] } }, - "/v1/tools/{tool_name}": { + "/v1/eval/benchmarks/{benchmark_id}": { "get": { "responses": { "200": { @@ -992,19 +958,26 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Tool" + "oneOf": [ + { + "$ref": "#/components/schemas/Benchmark" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "ToolGroups" + "Benchmarks" ], "description": "", "parameters": [ { - "name": "tool_name", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1014,7 +987,7 @@ ] } }, - "/v1/toolgroups/{toolgroup_id}": { + "/v1/datasets/{dataset_id}": { "get": { "responses": { "200": { @@ -1022,19 +995,26 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ToolGroup" + "oneOf": 
[ + { + "$ref": "#/components/schemas/Dataset" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "ToolGroups" + "Datasets" ], "description": "", "parameters": [ { - "name": "toolgroup_id", + "name": "dataset_id", "in": "path", "required": true, "schema": { @@ -1050,12 +1030,12 @@ } }, "tags": [ - "ToolGroups" + "Datasets" ], - "description": "Unregister a tool group", + "description": "", "parameters": [ { - "name": "toolgroup_id", + "name": "dataset_id", "in": "path", "required": true, "schema": { @@ -1065,7 +1045,7 @@ ] } }, - "/v1/telemetry/traces/{trace_id}": { + "/v1/models/{model_id}": { "get": { "responses": { "200": { @@ -1073,19 +1053,47 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Trace" + "oneOf": [ + { + "$ref": "#/components/schemas/Model" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "Telemetry" + "Models" ], "description": "", "parameters": [ { - "name": "trace_id", + "name": "model_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "delete": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Models" + ], + "description": "", + "parameters": [ + { + "name": "model_id", "in": "path", "required": true, "schema": { @@ -1095,7 +1103,7 @@ ] } }, - "/v1/post-training/job/artifacts": { + "/v1/scoring-functions/{scoring_fn_id}": { "get": { "responses": { "200": { @@ -1105,7 +1113,7 @@ "schema": { "oneOf": [ { - "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" + "$ref": "#/components/schemas/ScoringFn" }, { "type": "null" @@ -1117,13 +1125,13 @@ } }, "tags": [ - "PostTraining (Coming Soon)" + "ScoringFunctions" ], "description": "", "parameters": [ { - "name": "job_uuid", - "in": "query", + "name": "scoring_fn_id", + "in": "path", "required": true, "schema": { "type": "string" @@ -1132,7 +1140,7 @@ ] } }, - "/v1/post-training/job/status": { + "/v1/shields/{identifier}": { "get": { "responses": { "200": { @@ -1142,7 +1150,7 
@@ "schema": { "oneOf": [ { - "$ref": "#/components/schemas/PostTrainingJobStatusResponse" + "$ref": "#/components/schemas/Shield" }, { "type": "null" @@ -1154,13 +1162,13 @@ } }, "tags": [ - "PostTraining (Coming Soon)" + "Shields" ], "description": "", "parameters": [ { - "name": "job_uuid", - "in": "query", + "name": "identifier", + "in": "path", "required": true, "schema": { "type": "string" @@ -1169,7 +1177,7 @@ ] } }, - "/v1/post-training/jobs": { + "/v1/telemetry/traces/{trace_id}/spans/{span_id}": { "get": { "responses": { "200": { @@ -1177,20 +1185,37 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListPostTrainingJobsResponse" + "$ref": "#/components/schemas/Span" } } } } }, "tags": [ - "PostTraining (Coming Soon)" + "Telemetry" ], "description": "", - "parameters": [] + "parameters": [ + { + "name": "trace_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "span_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] } }, - "/v1/vector-dbs/{vector_db_id}": { + "/v1/telemetry/spans/{span_id}/tree": { "get": { "responses": { "200": { @@ -1198,47 +1223,68 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/VectorDB" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/QuerySpanTreeResponse" } } } } }, "tags": [ - "VectorDBs" + "Telemetry" ], "description": "", "parameters": [ { - "name": "vector_db_id", + "name": "span_id", "in": "path", "required": true, "schema": { "type": "string" } + }, + { + "name": "attributes_to_return", + "in": "query", + "required": false, + "schema": { + "type": "array", + "items": { + "type": "string" + } + } + }, + { + "name": "max_depth", + "in": "query", + "required": false, + "schema": { + "type": "integer" + } } ] - }, - "delete": { + } + }, + "/v1/tools/{tool_name}": { + "get": { "responses": { "200": { - "description": "OK" + "description": "OK", + 
"content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Tool" + } + } + } } }, "tags": [ - "VectorDBs" + "ToolGroups" ], "description": "", "parameters": [ { - "name": "vector_db_id", + "name": "tool_name", "in": "path", "required": true, "schema": { @@ -1248,7 +1294,7 @@ ] } }, - "/v1/health": { + "/v1/toolgroups/{toolgroup_id}": { "get": { "responses": { "200": { @@ -1256,69 +1302,303 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/HealthInfo" + "$ref": "#/components/schemas/ToolGroup" } } } } }, "tags": [ - "Inspect" + "ToolGroups" ], "description": "", - "parameters": [] - } - }, - "/v1/tool-runtime/rag-tool/insert": { - "post": { + "parameters": [ + { + "name": "toolgroup_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "delete": { "responses": { "200": { "description": "OK" } }, "tags": [ - "ToolRuntime" + "ToolGroups" ], - "description": "Index documents so they can be used by the RAG system", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InsertRequest" - } + "description": "Unregister a tool group", + "parameters": [ + { + "name": "toolgroup_id", + "in": "path", + "required": true, + "schema": { + "type": "string" } - }, - "required": true - } + } + ] } }, - "/v1/vector-io/insert": { - "post": { + "/v1/telemetry/traces/{trace_id}": { + "get": { "responses": { "200": { - "description": "OK" + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Trace" + } + } + } } }, "tags": [ - "VectorIO" + "Telemetry" ], "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InsertChunksRequest" - } + "parameters": [ + { + "name": "trace_id", + "in": "path", + "required": true, + "schema": { + "type": "string" } - }, - "required": true - } + } + ] } }, - 
"/v1/tool-runtime/invoke": { - "post": { + "/v1/post-training/job/artifacts": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "PostTraining (Coming Soon)" + ], + "description": "", + "parameters": [ + { + "name": "job_uuid", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/post-training/job/status": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/PostTrainingJobStatusResponse" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "PostTraining (Coming Soon)" + ], + "description": "", + "parameters": [ + { + "name": "job_uuid", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/post-training/jobs": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListPostTrainingJobsResponse" + } + } + } + } + }, + "tags": [ + "PostTraining (Coming Soon)" + ], + "description": "", + "parameters": [] + } + }, + "/v1/vector-dbs/{vector_db_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/VectorDB" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "VectorDBs" + ], + "description": "", + "parameters": [ + { + "name": "vector_db_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "delete": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "VectorDBs" + ], + "description": "", + "parameters": [ + { + "name": "vector_db_id", + "in": "path", + 
"required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/health": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HealthInfo" + } + } + } + } + }, + "tags": [ + "Inspect" + ], + "description": "", + "parameters": [] + } + }, + "/v1/tool-runtime/rag-tool/insert": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "ToolRuntime" + ], + "description": "Index documents so they can be used by the RAG system", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InsertRequest" + } + } + }, + "required": true + } + } + }, + "/v1/vector-io/insert": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "VectorIO" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InsertChunksRequest" + } + } + }, + "required": true + } + } + }, + "/v1/tool-runtime/invoke": { + "post": { "responses": { "200": { "description": "OK", @@ -1348,7 +1628,7 @@ } } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1375,7 +1655,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1404,7 +1684,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1422,7 +1702,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1442,7 +1722,7 @@ "description": "", "parameters": [ { - "name": "job_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1450,7 +1730,7 @@ } }, { - 
"name": "task_id", + "name": "job_id", "in": "path", "required": true, "schema": { @@ -1460,7 +1740,7 @@ ] } }, - "/v1/datasets": { + "/v1/eval/benchmarks": { "get": { "responses": { "200": { @@ -1468,14 +1748,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListDatasetsResponse" + "$ref": "#/components/schemas/ListBenchmarksResponse" } } } } }, "tags": [ - "Datasets" + "Benchmarks" ], "description": "", "parameters": [] @@ -1487,7 +1767,7 @@ } }, "tags": [ - "Datasets" + "Benchmarks" ], "description": "", "parameters": [], @@ -1495,7 +1775,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RegisterDatasetRequest" + "$ref": "#/components/schemas/RegisterBenchmarkRequest" } } }, @@ -1503,7 +1783,7 @@ } } }, - "/v1/eval-tasks": { + "/v1/datasets": { "get": { "responses": { "200": { @@ -1511,14 +1791,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListEvalTasksResponse" + "$ref": "#/components/schemas/ListDatasetsResponse" } } } } }, "tags": [ - "EvalTasks" + "Datasets" ], "description": "", "parameters": [] @@ -1530,7 +1810,7 @@ } }, "tags": [ - "EvalTasks" + "Datasets" ], "description": "", "parameters": [], @@ -1538,7 +1818,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RegisterEvalTaskRequest" + "$ref": "#/components/schemas/RegisterDatasetRequest" } } }, @@ -2121,7 +2401,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs": { + "/v1/eval/benchmarks/{benchmark_id}/jobs": { "post": { "responses": { "200": { @@ -2141,7 +2421,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -2365,84 +2645,216 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { - "AppendRowsRequest": { + "AgentCandidate": { "type": "object", "properties": { - "dataset_id": { - "type": "string" + "type": { + "type": "string", + 
"const": "agent", + "default": "agent" }, - "rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } + "config": { + "$ref": "#/components/schemas/AgentConfig" } }, "additionalProperties": false, "required": [ - "dataset_id", - "rows" + "type", + "config" ] }, - "CompletionMessage": { + "AgentConfig": { "type": "object", "properties": { - "role": { - "type": "string", - "const": "assistant", - "default": "assistant", - "description": "Must be \"assistant\" to identify this as the model's response" + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the model's response" + "input_shields": { + "type": "array", + "items": { + "type": "string" + } }, - "stop_reason": { + "output_shields": { + "type": "array", + "items": { + "type": "string" + } + }, + "toolgroups": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AgentTool" + } + }, + "client_tools": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolDef" + } + }, + "tool_choice": { "type": "string", "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" + "auto", + "required" ], - "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." + "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. 
It depends on the Instruction Following capabilities of the model." }, - "tool_calls": { + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "Prompt format for calling custom / zero shot tools." + }, + "tool_config": { + "$ref": "#/components/schemas/ToolConfig" + }, + "max_infer_iters": { + "type": "integer", + "default": 10 + }, + "model": { + "type": "string" + }, + "instructions": { + "type": "string" + }, + "enable_session_persistence": { + "type": "boolean" + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat" + } + }, + "additionalProperties": false, + "required": [ + "model", + "instructions", + "enable_session_persistence" + ] + }, + "AgentTool": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "args": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "name", + "args" + ] + } + ] + }, + "AggregationFunctionType": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ] + }, + "BasicScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "basic", + "default": "basic" + }, + "aggregation_functions": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolCall" - }, - "description": "List of tool calls. Each tool call is a ToolCall object." 
+ "$ref": "#/components/schemas/AggregationFunctionType" + } } }, "additionalProperties": false, "required": [ - "role", - "content", - "stop_reason" + "type" + ] + }, + "BenchmarkConfig": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "benchmark", + "default": "benchmark" + }, + "eval_candidate": { + "$ref": "#/components/schemas/EvalCandidate" + }, + "scoring_params": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringFnParams" + } + }, + "num_examples": { + "type": "integer" + } + }, + "additionalProperties": false, + "required": [ + "type", + "eval_candidate", + "scoring_params" + ] + }, + "EvalCandidate": { + "oneOf": [ + { + "$ref": "#/components/schemas/ModelCandidate" + }, + { + "$ref": "#/components/schemas/AgentCandidate" + } ], - "description": "A message containing the model's (assistant) response in a chat conversation." + "discriminator": { + "propertyName": "type", + "mapping": { + "model": "#/components/schemas/ModelCandidate", + "agent": "#/components/schemas/AgentCandidate" + } + } }, "GrammarResponseFormat": { "type": "object", @@ -2610,30 +3022,89 @@ ], "description": "Configuration for JSON schema-guided response generation." 
}, - "Message": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" + "LLMAsJudgeScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm_as_judge", + "default": "llm_as_judge" }, - { - "$ref": "#/components/schemas/SystemMessage" + "judge_model": { + "type": "string" }, - { - "$ref": "#/components/schemas/ToolResponseMessage" + "prompt_template": { + "type": "string" }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ], - "discriminator": { - "propertyName": "role", - "mapping": { - "user": "#/components/schemas/UserMessage", - "system": "#/components/schemas/SystemMessage", - "tool": "#/components/schemas/ToolResponseMessage", - "assistant": "#/components/schemas/CompletionMessage" - } - } + "judge_score_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "judge_model" + ] + }, + "ModelCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "model", + "default": "model" + }, + "model": { + "type": "string" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "system_message": { + "$ref": "#/components/schemas/SystemMessage" + } + }, + "additionalProperties": false, + "required": [ + "type", + "model", + "sampling_params" + ] + }, + "RegexParserScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "regex_parser", + "default": "regex_parser" + }, + "parsing_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } + } + }, + "additionalProperties": false, + "required": [ + "type" + ] }, "ResponseFormat": { "oneOf": [ @@ -2693,6 +3164,27 
@@ } } }, + "ScoringFnParams": { + "oneOf": [ + { + "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" + }, + { + "$ref": "#/components/schemas/RegexParserScoringFnParams" + }, + { + "$ref": "#/components/schemas/BasicScoringFnParams" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", + "regex_parser": "#/components/schemas/RegexParserScoringFnParams", + "basic": "#/components/schemas/BasicScoringFnParams" + } + } + }, "SystemMessage": { "type": "object", "properties": { @@ -2735,90 +3227,79 @@ ], "description": "A text content item" }, - "ToolCall": { + "ToolConfig": { "type": "object", "properties": { - "call_id": { + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.", + "default": "auto" + }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." + }, + "system_message_behavior": { + "type": "string", + "enum": [ + "append", + "replace" + ], + "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. 
The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", + "default": "append" + } + }, + "additionalProperties": false, + "required": [ + "system_message_behavior" + ], + "description": "Configuration for tool use." + }, + "ToolDef": { + "type": "object", + "properties": { + "name": { "type": "string" }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] + "description": { + "type": "string" }, - "arguments": { + "parameters": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolParameter" + } + }, + "metadata": { "type": "object", "additionalProperties": { "oneOf": [ { - "type": "string" + "type": "null" }, { - "type": "integer" + "type": "boolean" }, { "type": "number" }, { - "type": "boolean" - }, - { - "type": "null" + "type": "string" }, { - "type": "array", - "items": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } + "type": "array" }, { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } + "type": "object" } ] } @@ -2826,49 +3307,16 @@ }, "additionalProperties": false, "required": [ - "call_id", - "tool_name", - "arguments" + "name" ] }, - "ToolDefinition": { + "ToolParameter": { "type": "object", "properties": { - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] - }, - "description": { + "name": { "type": "string" }, - "parameters": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ToolParamDefinition" - } - } - }, - 
"additionalProperties": false, - "required": [ - "tool_name" - ] - }, - "ToolParamDefinition": { - "type": "object", - "properties": { - "param_type": { + "parameter_type": { "type": "string" }, "description": { @@ -2903,54 +3351,13 @@ }, "additionalProperties": false, "required": [ - "param_type" + "name", + "parameter_type", + "description", + "required" ] }, - "ToolResponseMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "tool", - "default": "tool", - "description": "Must be \"tool\" to identify this as a tool response" - }, - "call_id": { - "type": "string", - "description": "Unique identifier for the tool call this response is for" - }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ], - "description": "Name of the tool that was called" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The response content from the tool" - } - }, - "additionalProperties": false, - "required": [ - "role", - "call_id", - "tool_name", - "content" - ], - "description": "A message representing the result of a tool invocation." - }, - "TopKSamplingStrategy": { + "TopKSamplingStrategy": { "type": "object", "properties": { "type": { @@ -3001,277 +3408,379 @@ "uri" ] }, - "UserMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "user", - "default": "user", - "description": "Must be \"user\" to identify this as a user message" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the message, which can include text and other media" - }, - "context": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future." 
- } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "description": "A message from the user in a chat conversation." - }, - "BatchChatCompletionRequest": { + "DeprecatedEvaluateRowsRequest": { "type": "object", "properties": { - "model": { - "type": "string" - }, - "messages_batch": { + "input_rows": { "type": "array", "items": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Message" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] } } }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "tools": { + "scoring_functions": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDefinition" + "type": "string" } }, - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "Prompt format for calling custom / zero shot tools." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat" - }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." 
- } - }, - "additionalProperties": false + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, "required": [ - "model", - "messages_batch" + "input_rows", + "scoring_functions", + "task_config" ] }, - "BatchChatCompletionResponse": { + "EvaluateResponse": { "type": "object", "properties": { - "batch": { + "generations": { "type": "array", "items": { - "$ref": "#/components/schemas/ChatCompletionResponse" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "scores": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" } } }, "additionalProperties": false, "required": [ - "batch" + "generations", + "scores" ] }, - "ChatCompletionResponse": { + "ScoringResult": { "type": "object", "properties": { - "metrics": { + "score_rows": { "type": "array", "items": { - "$ref": "#/components/schemas/MetricEvent" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, - "completion_message": { - "$ref": "#/components/schemas/CompletionMessage", - "description": "The complete response message" - }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" - } - }, - "additionalProperties": false, - "required": [ - "completion_message" - ], - "description": "Response from a chat completion request." 
- }, - "MetricEvent": { - "type": "object", - "properties": { - "trace_id": { - "type": "string" - }, - "span_id": { - "type": "string" - }, - "timestamp": { - "type": "string", - "format": "date-time" - }, - "attributes": { + "aggregated_results": { "type": "object", "additionalProperties": { "oneOf": [ { - "type": "string" + "type": "null" }, { - "type": "integer" + "type": "boolean" }, { "type": "number" }, { - "type": "boolean" + "type": "string" }, { - "type": "null" + "type": "array" + }, + { + "type": "object" } ] } + } + }, + "additionalProperties": false, + "required": [ + "score_rows", + "aggregated_results" + ] + }, + "Benchmark": { + "type": "object", + "properties": { + "identifier": { + "type": "string" }, - "type": { - "type": "string", - "const": "metric", - "default": "metric" - }, - "metric": { + "provider_resource_id": { "type": "string" }, - "value": { - "oneOf": [ - { - "type": "integer" - }, - { - "type": "number" - } - ] + "provider_id": { + "type": "string" }, - "unit": { + "type": { + "type": "string", + "const": "benchmark", + "default": "benchmark" + }, + "dataset_id": { "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, "required": [ - "trace_id", - "span_id", - "timestamp", + "identifier", + "provider_resource_id", + "provider_id", "type", - "metric", - "value", - "unit" + "dataset_id", + "scoring_functions", + "metadata" ] }, - "TokenLogProbs": { + "JobStatus": { + "type": "string", + "enum": [ + "completed", + "in_progress", + "failed", + "scheduled" + ] + }, + "ListBenchmarksResponse": { "type": "object", "properties": { - "logprobs_by_token": { - "type": "object", - "additionalProperties": { - 
"type": "number" - }, - "description": "Dictionary mapping tokens to their log probabilities" + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Benchmark" + } } }, "additionalProperties": false, "required": [ - "logprobs_by_token" - ], - "description": "Log probabilities for generated tokens." + "data" + ] }, - "BatchCompletionRequest": { + "DeprecatedRegisterEvalTaskRequest": { "type": "object", "properties": { - "model": { + "eval_task_id": { "type": "string" }, - "content_batch": { + "dataset_id": { + "type": "string" + }, + "scoring_functions": { "type": "array", "items": { - "$ref": "#/components/schemas/InterleavedContent" + "type": "string" } }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "provider_benchmark_id": { + "type": "string" }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "provider_id": { + "type": "string" }, - "logprobs": { + "metadata": { "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." 
- } - }, - "additionalProperties": false + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, "required": [ - "model", - "content_batch" + "eval_task_id", + "dataset_id", + "scoring_functions" ] }, - "BatchCompletionResponse": { + "DeprecatedRunEvalRequest": { "type": "object", "properties": { - "batch": { + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "task_config" + ] + }, + "Job": { + "type": "object", + "properties": { + "job_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "job_id" + ] + }, + "AppendRowsRequest": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "rows": { "type": "array", "items": { - "$ref": "#/components/schemas/CompletionResponse" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } } }, "additionalProperties": false, "required": [ - "batch" + "dataset_id", + "rows" ] }, - "CompletionResponse": { + "CompletionMessage": { "type": "object", "properties": { - "content": { + "role": { "type": "string", - "description": "The generated completion text" + "const": "assistant", + "default": "assistant", + "description": "Must be \"assistant\" to identify this as the model's response" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the model's response" }, "stop_reason": { "type": "string", @@ -3280,398 +3789,311 @@ "end_of_message", "out_of_tokens" ], - "description": "Reason why generation stopped" + "description": "Reason why the model stopped generating. 
Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." }, - "logprobs": { + "tool_calls": { "type": "array", "items": { - "$ref": "#/components/schemas/TokenLogProbs" + "$ref": "#/components/schemas/ToolCall" }, - "description": "Optional log probabilities for generated tokens" + "description": "List of tool calls. Each tool call is a ToolCall object." } }, "additionalProperties": false, "required": [ + "role", "content", "stop_reason" ], - "description": "Response from a completion request." + "description": "A message containing the model's (assistant) response in a chat conversation." }, - "CancelTrainingJobRequest": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" + "Message": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, - "ToolConfig": { - "type": "object", - "properties": { - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.", - "default": "auto" - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. 
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." - }, - "system_message_behavior": { - "type": "string", - "enum": [ - "append", - "replace" - ], - "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", - "default": "append" - } - }, - "additionalProperties": false, - "required": [ - "system_message_behavior" ], - "description": "Configuration for tool use." + "discriminator": { + "propertyName": "role", + "mapping": { + "user": "#/components/schemas/UserMessage", + "system": "#/components/schemas/SystemMessage", + "tool": "#/components/schemas/ToolResponseMessage", + "assistant": "#/components/schemas/CompletionMessage" + } + } }, - "ChatCompletionRequest": { + "ToolCall": { "type": "object", "properties": { - "model_id": { - "type": "string", - "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." 
- }, - "messages": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Message" - }, - "description": "List of messages in the conversation" - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "Parameters to control the sampling strategy" - }, - "tools": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolDefinition" - }, - "description": "(Optional) List of tool definitions available to the model" - }, - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead." - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it." - }, - "stream": { - "type": "boolean", - "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." 
+ "call_id": { + "type": "string" }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." + ] }, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig", - "description": "(Optional) Configuration for tool use." + "arguments": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + }, + { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + } + ] + } } }, "additionalProperties": false, "required": [ - "model_id", - "messages" + "call_id", + "tool_name", + "arguments" ] }, - "ChatCompletionResponseEvent": { + "ToolDefinition": { "type": "object", "properties": { - "event_type": { - "type": "string", - "enum": [ - "start", - "complete", - "progress" - ], - "description": "Type of the event" - }, - "delta": { - "$ref": "#/components/schemas/ContentDelta", - "description": "Content generated since last event. This can be one or more tokens, or a tool call." 
+ "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" + } + ] }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" + "description": { + "type": "string" }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Optional reason why generation stopped, if complete" - } - }, - "additionalProperties": false, - "required": [ - "event_type", - "delta" - ], - "description": "An event during chat completion generation." - }, - "ChatCompletionResponseStreamChunk": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricEvent" + "parameters": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ToolParamDefinition" } - }, - "event": { - "$ref": "#/components/schemas/ChatCompletionResponseEvent", - "description": "The event containing the new content" } }, "additionalProperties": false, "required": [ - "event" - ], - "description": "A chunk of a streamed chat completion response." 
- }, - "ContentDelta": { - "oneOf": [ - { - "$ref": "#/components/schemas/TextDelta" - }, - { - "$ref": "#/components/schemas/ImageDelta" - }, - { - "$ref": "#/components/schemas/ToolCallDelta" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "text": "#/components/schemas/TextDelta", - "image": "#/components/schemas/ImageDelta", - "tool_call": "#/components/schemas/ToolCallDelta" - } - } - }, - "ImageDelta": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "image", - "default": "image" - }, - "image": { - "type": "string", - "contentEncoding": "base64" - } - }, - "additionalProperties": false, - "required": [ - "type", - "image" + "tool_name" ] }, - "TextDelta": { + "ToolParamDefinition": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "text", - "default": "text" + "param_type": { + "type": "string" }, - "text": { + "description": { "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "type", - "text" - ] - }, - "ToolCallDelta": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "tool_call", - "default": "tool_call" }, - "tool_call": { + "required": { + "type": "boolean", + "default": true + }, + "default": { "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, { "type": "string" }, { - "$ref": "#/components/schemas/ToolCall" + "type": "array" + }, + { + "type": "object" } ] - }, - "parse_status": { - "type": "string", - "enum": [ - "started", - "in_progress", - "failed", - "succeeded" - ] } }, "additionalProperties": false, "required": [ - "type", - "tool_call", - "parse_status" + "param_type" ] }, - "CompletionRequest": { + "ToolResponseMessage": { "type": "object", "properties": { - "model_id": { + "role": { "type": "string", - "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." 
- }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content to generate a completion for" - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "(Optional) Parameters to control the sampling strategy" - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding" + "const": "tool", + "default": "tool", + "description": "Must be \"tool\" to identify this as a tool response" }, - "stream": { - "type": "boolean", - "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." + "call_id": { + "type": "string", + "description": "Unique identifier for the tool call this response is for" }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." + ], + "description": "Name of the tool that was called" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The response content from the tool" } }, "additionalProperties": false, "required": [ - "model_id", + "role", + "call_id", + "tool_name", "content" - ] + ], + "description": "A message representing the result of a tool invocation." }, - "CompletionResponseStreamChunk": { + "UserMessage": { "type": "object", "properties": { - "delta": { + "role": { "type": "string", - "description": "New content generated since last chunk. This can be one or more tokens." 
+ "const": "user", + "default": "user", + "description": "Must be \"user\" to identify this as a user message" }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Optional reason why generation stopped, if complete" + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the message, which can include text and other media" }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" + "context": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future." } }, "additionalProperties": false, "required": [ - "delta" + "role", + "content" ], - "description": "A chunk of a streamed completion response." + "description": "A message from the user in a chat conversation." }, - "AgentConfig": { + "BatchChatCompletionRequest": { "type": "object", "properties": { - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "input_shields": { - "type": "array", - "items": { - "type": "string" - } + "model": { + "type": "string" }, - "output_shields": { + "messages_batch": { "type": "array", "items": { - "type": "string" + "type": "array", + "items": { + "$ref": "#/components/schemas/Message" + } } }, - "toolgroups": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AgentTool" - } + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" }, - "client_tools": { + "tools": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDef" + "$ref": "#/components/schemas/ToolDefinition" } }, "tool_choice": { @@ -3691,565 +4113,614 @@ ], "description": "Prompt format for calling custom / zero shot tools." 
}, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig" - }, - "max_infer_iters": { - "type": "integer", - "default": 10 - }, - "model": { - "type": "string" - }, - "instructions": { - "type": "string" - }, - "enable_session_persistence": { - "type": "boolean" - }, "response_format": { "$ref": "#/components/schemas/ResponseFormat" + }, + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false } }, "additionalProperties": false, "required": [ "model", - "instructions", - "enable_session_persistence" + "messages_batch" ] }, - "AgentTool": { - "oneOf": [ - { - "type": "string" + "BatchChatCompletionResponse": { + "type": "object", + "properties": { + "batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ChatCompletionResponse" + } + } + }, + "additionalProperties": false, + "required": [ + "batch" + ] + }, + "ChatCompletionResponse": { + "type": "object", + "properties": { + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "args": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } + "completion_message": { + "$ref": "#/components/schemas/CompletionMessage", + "description": "The complete response message" + }, + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" }, - "additionalProperties": false, - "required": [ - "name", - "args" - ] + "description": "Optional log probabilities for generated tokens" } - ] + }, + "additionalProperties": false, + "required": [ + "completion_message" + ], + "description": "Response 
from a chat completion request." }, - "ToolDef": { + "MetricEvent": { "type": "object", "properties": { - "name": { + "trace_id": { "type": "string" }, - "description": { + "span_id": { "type": "string" }, - "parameters": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolParameter" - } + "timestamp": { + "type": "string", + "format": "date-time" }, - "metadata": { + "attributes": { "type": "object", "additionalProperties": { "oneOf": [ { - "type": "null" + "type": "string" }, { - "type": "boolean" + "type": "integer" }, { "type": "number" }, { - "type": "string" - }, - { - "type": "array" + "type": "boolean" }, { - "type": "object" + "type": "null" } ] } - } - }, - "additionalProperties": false, - "required": [ - "name" - ] - }, - "ToolParameter": { - "type": "object", - "properties": { - "name": { - "type": "string" }, - "parameter_type": { - "type": "string" + "type": { + "type": "string", + "const": "metric", + "default": "metric" }, - "description": { + "metric": { "type": "string" }, - "required": { - "type": "boolean", - "default": true - }, - "default": { + "value": { "oneOf": [ { - "type": "null" - }, - { - "type": "boolean" + "type": "integer" }, { "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" } ] + }, + "unit": { + "type": "string" } }, "additionalProperties": false, "required": [ - "name", - "parameter_type", - "description", - "required" + "trace_id", + "span_id", + "timestamp", + "type", + "metric", + "value", + "unit" ] }, - "CreateAgentRequest": { + "TokenLogProbs": { "type": "object", "properties": { - "agent_config": { - "$ref": "#/components/schemas/AgentConfig" + "logprobs_by_token": { + "type": "object", + "additionalProperties": { + "type": "number" + }, + "description": "Dictionary mapping tokens to their log probabilities" } }, "additionalProperties": false, "required": [ - "agent_config" - ] + "logprobs_by_token" + ], + "description": "Log probabilities for generated 
tokens." }, - "AgentCreateResponse": { + "BatchCompletionRequest": { "type": "object", "properties": { - "agent_id": { + "model": { "type": "string" + }, + "content_batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContent" + } + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat" + }, + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false } }, "additionalProperties": false, "required": [ - "agent_id" + "model", + "content_batch" ] }, - "CreateAgentSessionRequest": { + "BatchCompletionResponse": { + "type": "object", + "properties": { + "batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/CompletionResponse" + } + } + }, + "additionalProperties": false, + "required": [ + "batch" + ] + }, + "CompletionResponse": { "type": "object", "properties": { - "session_name": { - "type": "string" + "content": { + "type": "string", + "description": "The generated completion text" + }, + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Reason why generation stopped" + }, + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" } }, "additionalProperties": false, "required": [ - "session_name" - ] + "content", + "stop_reason" + ], + "description": "Response from a completion request." 
}, - "AgentSessionCreateResponse": { + "CancelTrainingJobRequest": { "type": "object", "properties": { - "session_id": { + "job_uuid": { "type": "string" } }, "additionalProperties": false, "required": [ - "session_id" + "job_uuid" ] }, - "CreateAgentTurnRequest": { + "ChatCompletionRequest": { "type": "object", "properties": { + "model_id": { + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." + }, "messages": { "type": "array", "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - } - ] - } + "$ref": "#/components/schemas/Message" + }, + "description": "List of messages in the conversation" }, - "stream": { - "type": "boolean" + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "Parameters to control the sampling strategy" }, - "documents": { + "tools": { "type": "array", "items": { - "type": "object", - "properties": { - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/InterleavedContentItem" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContentItem" - } - }, - { - "$ref": "#/components/schemas/URL" - } - ] - }, - "mime_type": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "content", - "mime_type" - ] - } + "$ref": "#/components/schemas/ToolDefinition" + }, + "description": "(Optional) List of tool definitions available to the model" }, - "toolgroups": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AgentTool" - } + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead." 
+ }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead." + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it." + }, + "stream": { + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." + }, + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." }, "tool_config": { - "$ref": "#/components/schemas/ToolConfig" + "$ref": "#/components/schemas/ToolConfig", + "description": "(Optional) Configuration for tool use." 
} }, "additionalProperties": false, "required": [ + "model_id", "messages" ] }, - "InferenceStep": { + "ChatCompletionResponseEvent": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, - "step_id": { - "type": "string" - }, - "started_at": { + "event_type": { "type": "string", - "format": "date-time" + "enum": [ + "start", + "complete", + "progress" + ], + "description": "Type of the event" }, - "completed_at": { - "type": "string", - "format": "date-time" + "delta": { + "$ref": "#/components/schemas/ContentDelta", + "description": "Content generated since last event. This can be one or more tokens, or a tool call." }, - "step_type": { - "type": "string", - "const": "inference", - "default": "inference" + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" }, - "model_response": { - "$ref": "#/components/schemas/CompletionMessage" + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Optional reason why generation stopped, if complete" } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "model_response" - ] + "event_type", + "delta" + ], + "description": "An event during chat completion generation." }, - "MemoryRetrievalStep": { + "ChatCompletionResponseStreamChunk": { "type": "object", "properties": { - "turn_id": { - "type": "string" + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } }, - "step_id": { - "type": "string" + "event": { + "$ref": "#/components/schemas/ChatCompletionResponseEvent", + "description": "The event containing the new content" + } + }, + "additionalProperties": false, + "required": [ + "event" + ], + "description": "A chunk of a streamed chat completion response." 
+ }, + "ContentDelta": { + "oneOf": [ + { + "$ref": "#/components/schemas/TextDelta" }, - "started_at": { - "type": "string", - "format": "date-time" + { + "$ref": "#/components/schemas/ImageDelta" }, - "completed_at": { + { + "$ref": "#/components/schemas/ToolCallDelta" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "text": "#/components/schemas/TextDelta", + "image": "#/components/schemas/ImageDelta", + "tool_call": "#/components/schemas/ToolCallDelta" + } + } + }, + "ImageDelta": { + "type": "object", + "properties": { + "type": { "type": "string", - "format": "date-time" + "const": "image", + "default": "image" }, - "step_type": { + "image": { + "type": "string", + "contentEncoding": "base64" + } + }, + "additionalProperties": false, + "required": [ + "type", + "image" + ] + }, + "TextDelta": { + "type": "object", + "properties": { + "type": { "type": "string", - "const": "memory_retrieval", - "default": "memory_retrieval" + "const": "text", + "default": "text" }, - "vector_db_ids": { + "text": { "type": "string" - }, - "inserted_context": { - "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "vector_db_ids", - "inserted_context" + "type", + "text" ] }, - "SafetyViolation": { + "ToolCallDelta": { "type": "object", "properties": { - "violation_level": { - "$ref": "#/components/schemas/ViolationLevel" + "type": { + "type": "string", + "const": "tool_call", + "default": "tool_call" }, - "user_message": { - "type": "string" + "tool_call": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/ToolCall" + } + ] }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "parse_status": { + "type": "string", + "enum": [ + "started", + 
"in_progress", + "failed", + "succeeded" + ] } }, "additionalProperties": false, "required": [ - "violation_level", - "metadata" + "type", + "tool_call", + "parse_status" ] }, - "ShieldCallStep": { + "CompletionRequest": { "type": "object", "properties": { - "turn_id": { - "type": "string" + "model_id": { + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." }, - "step_id": { - "type": "string" + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content to generate a completion for" }, - "started_at": { - "type": "string", - "format": "date-time" + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "(Optional) Parameters to control the sampling strategy" }, - "completed_at": { - "type": "string", - "format": "date-time" + "response_format": { + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding" }, - "step_type": { - "type": "string", - "const": "shield_call", - "default": "shield_call" + "stream": { + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." }, - "violation": { - "$ref": "#/components/schemas/SafetyViolation" + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." 
} }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type" + "model_id", + "content" ] }, - "ToolExecutionStep": { + "CompletionResponseStreamChunk": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, - "step_id": { - "type": "string" - }, - "started_at": { - "type": "string", - "format": "date-time" - }, - "completed_at": { + "delta": { "type": "string", - "format": "date-time" + "description": "New content generated since last chunk. This can be one or more tokens." }, - "step_type": { + "stop_reason": { "type": "string", - "const": "tool_execution", - "default": "tool_execution" - }, - "tool_calls": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolCall" - } + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Optional reason why generation stopped, if complete" }, - "tool_responses": { + "logprobs": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolResponse" - } + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "tool_calls", - "tool_responses" + "delta" + ], + "description": "A chunk of a streamed completion response." 
+ }, + "CreateAgentRequest": { + "type": "object", + "properties": { + "agent_config": { + "$ref": "#/components/schemas/AgentConfig" + } + }, + "additionalProperties": false, + "required": [ + "agent_config" ] }, - "ToolResponse": { + "AgentCreateResponse": { "type": "object", "properties": { - "call_id": { + "agent_id": { "type": "string" - }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "call_id", - "tool_name", - "content" + "agent_id" ] }, - "Turn": { + "CreateAgentSessionRequest": { "type": "object", "properties": { - "turn_id": { + "session_name": { "type": "string" - }, + } + }, + "additionalProperties": false, + "required": [ + "session_name" + ] + }, + "AgentSessionCreateResponse": { + "type": "object", + "properties": { "session_id": { "type": "string" - }, - "input_messages": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - } - ] - } - }, - "steps": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/InferenceStep" - }, - { - "$ref": "#/components/schemas/ToolExecutionStep" - }, + } + }, + "additionalProperties": false, + "required": [ + "session_id" + ] + }, + "CreateAgentTurnRequest": { + "type": "object", + "properties": { + "messages": { + "type": "array", + "items": { + "oneOf": [ { - "$ref": "#/components/schemas/ShieldCallStep" + "$ref": "#/components/schemas/UserMessage" }, { - "$ref": "#/components/schemas/MemoryRetrievalStep" - } - ], - "discriminator": { - "propertyName": "step_type", - "mapping": { - "inference": "#/components/schemas/InferenceStep", - "tool_execution": "#/components/schemas/ToolExecutionStep", - "shield_call": 
"#/components/schemas/ShieldCallStep", - "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" + "$ref": "#/components/schemas/ToolResponseMessage" } - } + ] } }, - "output_message": { - "$ref": "#/components/schemas/CompletionMessage" + "stream": { + "type": "boolean" }, - "output_attachments": { + "documents": { "type": "array", "items": { "type": "object", @@ -4284,179 +4755,100 @@ ] } }, - "started_at": { - "type": "string", - "format": "date-time" + "toolgroups": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AgentTool" + } }, - "completed_at": { - "type": "string", - "format": "date-time" + "tool_config": { + "$ref": "#/components/schemas/ToolConfig" } }, "additionalProperties": false, "required": [ - "turn_id", - "session_id", - "input_messages", - "steps", - "output_message", - "started_at" - ], - "description": "A single turn in an interaction with an Agentic System." - }, - "ViolationLevel": { - "type": "string", - "enum": [ - "info", - "warn", - "error" + "messages" ] }, - "AgentTurnResponseEvent": { + "InferenceStep": { "type": "object", "properties": { - "payload": { - "$ref": "#/components/schemas/AgentTurnResponseEventPayload" - } - }, - "additionalProperties": false, - "required": [ - "payload" - ] - }, - "AgentTurnResponseEventPayload": { - "oneOf": [ - { - "$ref": "#/components/schemas/AgentTurnResponseStepStartPayload" - }, - { - "$ref": "#/components/schemas/AgentTurnResponseStepProgressPayload" + "turn_id": { + "type": "string" }, - { - "$ref": "#/components/schemas/AgentTurnResponseStepCompletePayload" + "step_id": { + "type": "string" }, - { - "$ref": "#/components/schemas/AgentTurnResponseTurnStartPayload" + "started_at": { + "type": "string", + "format": "date-time" }, - { - "$ref": "#/components/schemas/AgentTurnResponseTurnCompletePayload" - } - ], - "discriminator": { - "propertyName": "event_type", - "mapping": { - "step_start": "#/components/schemas/AgentTurnResponseStepStartPayload", - "step_progress": 
"#/components/schemas/AgentTurnResponseStepProgressPayload", - "step_complete": "#/components/schemas/AgentTurnResponseStepCompletePayload", - "turn_start": "#/components/schemas/AgentTurnResponseTurnStartPayload", - "turn_complete": "#/components/schemas/AgentTurnResponseTurnCompletePayload" - } - } - }, - "AgentTurnResponseStepCompletePayload": { - "type": "object", - "properties": { - "event_type": { + "completed_at": { "type": "string", - "const": "step_complete", - "default": "step_complete" + "format": "date-time" }, "step_type": { "type": "string", - "enum": [ - "inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] - }, - "step_id": { - "type": "string" + "const": "inference", + "default": "inference" }, - "step_details": { - "oneOf": [ - { - "$ref": "#/components/schemas/InferenceStep" - }, - { - "$ref": "#/components/schemas/ToolExecutionStep" - }, - { - "$ref": "#/components/schemas/ShieldCallStep" - }, - { - "$ref": "#/components/schemas/MemoryRetrievalStep" - } - ], - "discriminator": { - "propertyName": "step_type", - "mapping": { - "inference": "#/components/schemas/InferenceStep", - "tool_execution": "#/components/schemas/ToolExecutionStep", - "shield_call": "#/components/schemas/ShieldCallStep", - "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" - } - } + "model_response": { + "$ref": "#/components/schemas/CompletionMessage" } }, "additionalProperties": false, "required": [ - "event_type", - "step_type", + "turn_id", "step_id", - "step_details" + "step_type", + "model_response" ] }, - "AgentTurnResponseStepProgressPayload": { + "MemoryRetrievalStep": { "type": "object", "properties": { - "event_type": { + "turn_id": { + "type": "string" + }, + "step_id": { + "type": "string" + }, + "started_at": { "type": "string", - "const": "step_progress", - "default": "step_progress" + "format": "date-time" + }, + "completed_at": { + "type": "string", + "format": "date-time" }, "step_type": { "type": "string", - "enum": [ - 
"inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] + "const": "memory_retrieval", + "default": "memory_retrieval" }, - "step_id": { + "vector_db_ids": { "type": "string" }, - "delta": { - "$ref": "#/components/schemas/ContentDelta" + "inserted_context": { + "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "event_type", - "step_type", + "turn_id", "step_id", - "delta" + "step_type", + "vector_db_ids", + "inserted_context" ] }, - "AgentTurnResponseStepStartPayload": { + "SafetyViolation": { "type": "object", "properties": { - "event_type": { - "type": "string", - "const": "step_start", - "default": "step_start" - }, - "step_type": { - "type": "string", - "enum": [ - "inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] + "violation_level": { + "$ref": "#/components/schemas/ViolationLevel" }, - "step_id": { + "user_message": { "type": "string" }, "metadata": { @@ -4487,432 +4879,510 @@ }, "additionalProperties": false, "required": [ - "event_type", - "step_type", - "step_id" + "violation_level", + "metadata" ] }, - "AgentTurnResponseStreamChunk": { - "type": "object", - "properties": { - "event": { - "$ref": "#/components/schemas/AgentTurnResponseEvent" - } - }, - "additionalProperties": false, - "required": [ - "event" - ], - "description": "streamed agent turn completion response." 
- }, - "AgentTurnResponseTurnCompletePayload": { + "ShieldCallStep": { "type": "object", "properties": { - "event_type": { + "turn_id": { + "type": "string" + }, + "step_id": { + "type": "string" + }, + "started_at": { "type": "string", - "const": "turn_complete", - "default": "turn_complete" + "format": "date-time" }, - "turn": { - "$ref": "#/components/schemas/Turn" - } - }, - "additionalProperties": false, - "required": [ - "event_type", - "turn" - ] - }, - "AgentTurnResponseTurnStartPayload": { - "type": "object", - "properties": { - "event_type": { + "completed_at": { + "type": "string", + "format": "date-time" + }, + "step_type": { "type": "string", - "const": "turn_start", - "default": "turn_start" + "const": "shield_call", + "default": "shield_call" }, - "turn_id": { - "type": "string" + "violation": { + "$ref": "#/components/schemas/SafetyViolation" } }, "additionalProperties": false, "required": [ - "event_type", - "turn_id" + "turn_id", + "step_id", + "step_type" ] }, - "EmbeddingsRequest": { + "ToolExecutionStep": { "type": "object", "properties": { - "model_id": { + "turn_id": { + "type": "string" + }, + "step_id": { + "type": "string" + }, + "started_at": { "type": "string", - "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." + "format": "date-time" }, - "contents": { + "completed_at": { + "type": "string", + "format": "date-time" + }, + "step_type": { + "type": "string", + "const": "tool_execution", + "default": "tool_execution" + }, + "tool_calls": { "type": "array", "items": { - "$ref": "#/components/schemas/InterleavedContent" - }, - "description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text." 
- } - }, - "additionalProperties": false, - "required": [ - "model_id", - "contents" - ] - }, - "EmbeddingsResponse": { - "type": "object", - "properties": { - "embeddings": { + "$ref": "#/components/schemas/ToolCall" + } + }, + "tool_responses": { "type": "array", "items": { - "type": "array", - "items": { - "type": "number" - } - }, - "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}" + "$ref": "#/components/schemas/ToolResponse" + } } }, "additionalProperties": false, "required": [ - "embeddings" - ], - "description": "Response containing generated embeddings." + "turn_id", + "step_id", + "step_type", + "tool_calls", + "tool_responses" + ] }, - "AgentCandidate": { + "ToolResponse": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "agent", - "default": "agent" + "call_id": { + "type": "string" }, - "config": { - "$ref": "#/components/schemas/AgentConfig" + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" + } + ] + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "type", - "config" - ] - }, - "AggregationFunctionType": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" + "call_id", + "tool_name", + "content" ] }, - "AppEvalTaskConfig": { + "Turn": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "app", - "default": "app" + "turn_id": { + "type": "string" }, - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate" + "session_id": { + "type": "string" }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" + "input_messages": { + "type": 
"array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + } + ] } }, - "num_examples": { - "type": "integer" - } - }, - "additionalProperties": false, - "required": [ - "type", - "eval_candidate", - "scoring_params" - ] - }, - "BasicScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "basic", - "default": "basic" + "steps": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/InferenceStep" + }, + { + "$ref": "#/components/schemas/ToolExecutionStep" + }, + { + "$ref": "#/components/schemas/ShieldCallStep" + }, + { + "$ref": "#/components/schemas/MemoryRetrievalStep" + } + ], + "discriminator": { + "propertyName": "step_type", + "mapping": { + "inference": "#/components/schemas/InferenceStep", + "tool_execution": "#/components/schemas/ToolExecutionStep", + "shield_call": "#/components/schemas/ShieldCallStep", + "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" + } + } + } }, - "aggregation_functions": { + "output_message": { + "$ref": "#/components/schemas/CompletionMessage" + }, + "output_attachments": { "type": "array", "items": { - "$ref": "#/components/schemas/AggregationFunctionType" + "type": "object", + "properties": { + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/InterleavedContentItem" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContentItem" + } + }, + { + "$ref": "#/components/schemas/URL" + } + ] + }, + "mime_type": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "content", + "mime_type" + ] } + }, + "started_at": { + "type": "string", + "format": "date-time" + }, + "completed_at": { + "type": "string", + "format": "date-time" } }, "additionalProperties": false, "required": [ - "type" + "turn_id", + "session_id", + "input_messages", + "steps", + 
"output_message", + "started_at" + ], + "description": "A single turn in an interaction with an Agentic System." + }, + "ViolationLevel": { + "type": "string", + "enum": [ + "info", + "warn", + "error" ] }, - "BenchmarkEvalTaskConfig": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "benchmark", - "default": "benchmark" - }, - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate" - }, - "num_examples": { - "type": "integer" + "AgentTurnResponseEvent": { + "type": "object", + "properties": { + "payload": { + "$ref": "#/components/schemas/AgentTurnResponseEventPayload" } }, "additionalProperties": false, "required": [ - "type", - "eval_candidate" + "payload" ] }, - "EvalCandidate": { + "AgentTurnResponseEventPayload": { "oneOf": [ { - "$ref": "#/components/schemas/ModelCandidate" + "$ref": "#/components/schemas/AgentTurnResponseStepStartPayload" }, { - "$ref": "#/components/schemas/AgentCandidate" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "model": "#/components/schemas/ModelCandidate", - "agent": "#/components/schemas/AgentCandidate" - } - } - }, - "EvalTaskConfig": { - "oneOf": [ + "$ref": "#/components/schemas/AgentTurnResponseStepProgressPayload" + }, + { + "$ref": "#/components/schemas/AgentTurnResponseStepCompletePayload" + }, { - "$ref": "#/components/schemas/BenchmarkEvalTaskConfig" + "$ref": "#/components/schemas/AgentTurnResponseTurnStartPayload" }, { - "$ref": "#/components/schemas/AppEvalTaskConfig" + "$ref": "#/components/schemas/AgentTurnResponseTurnCompletePayload" } ], "discriminator": { - "propertyName": "type", + "propertyName": "event_type", "mapping": { - "benchmark": "#/components/schemas/BenchmarkEvalTaskConfig", - "app": "#/components/schemas/AppEvalTaskConfig" + "step_start": "#/components/schemas/AgentTurnResponseStepStartPayload", + "step_progress": "#/components/schemas/AgentTurnResponseStepProgressPayload", + "step_complete": 
"#/components/schemas/AgentTurnResponseStepCompletePayload", + "turn_start": "#/components/schemas/AgentTurnResponseTurnStartPayload", + "turn_complete": "#/components/schemas/AgentTurnResponseTurnCompletePayload" } } }, - "LLMAsJudgeScoringFnParams": { + "AgentTurnResponseStepCompletePayload": { "type": "object", "properties": { - "type": { + "event_type": { "type": "string", - "const": "llm_as_judge", - "default": "llm_as_judge" + "const": "step_complete", + "default": "step_complete" }, - "judge_model": { - "type": "string" + "step_type": { + "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ] }, - "prompt_template": { + "step_id": { "type": "string" }, - "judge_score_regexes": { - "type": "array", - "items": { - "type": "string" - } - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" + "step_details": { + "oneOf": [ + { + "$ref": "#/components/schemas/InferenceStep" + }, + { + "$ref": "#/components/schemas/ToolExecutionStep" + }, + { + "$ref": "#/components/schemas/ShieldCallStep" + }, + { + "$ref": "#/components/schemas/MemoryRetrievalStep" + } + ], + "discriminator": { + "propertyName": "step_type", + "mapping": { + "inference": "#/components/schemas/InferenceStep", + "tool_execution": "#/components/schemas/ToolExecutionStep", + "shield_call": "#/components/schemas/ShieldCallStep", + "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" + } } } }, "additionalProperties": false, "required": [ - "type", - "judge_model" + "event_type", + "step_type", + "step_id", + "step_details" ] }, - "ModelCandidate": { + "AgentTurnResponseStepProgressPayload": { "type": "object", "properties": { - "type": { + "event_type": { "type": "string", - "const": "model", - "default": "model" + "const": "step_progress", + "default": "step_progress" }, - "model": { - "type": "string" + "step_type": { + "type": "string", + "enum": [ + "inference", + 
"tool_execution", + "shield_call", + "memory_retrieval" + ] }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "step_id": { + "type": "string" }, - "system_message": { - "$ref": "#/components/schemas/SystemMessage" + "delta": { + "$ref": "#/components/schemas/ContentDelta" } }, "additionalProperties": false, "required": [ - "type", - "model", - "sampling_params" + "event_type", + "step_type", + "step_id", + "delta" ] }, - "RegexParserScoringFnParams": { + "AgentTurnResponseStepStartPayload": { "type": "object", "properties": { - "type": { + "event_type": { "type": "string", - "const": "regex_parser", - "default": "regex_parser" + "const": "step_start", + "default": "step_start" }, - "parsing_regexes": { - "type": "array", - "items": { - "type": "string" - } + "step_type": { + "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ] }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" + "step_id": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] } } }, "additionalProperties": false, "required": [ - "type" + "event_type", + "step_type", + "step_id" ] }, - "ScoringFnParams": { - "oneOf": [ - { - "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" - }, - { - "$ref": "#/components/schemas/RegexParserScoringFnParams" - }, - { - "$ref": "#/components/schemas/BasicScoringFnParams" + "AgentTurnResponseStreamChunk": { + "type": "object", + "properties": { + "event": { + "$ref": "#/components/schemas/AgentTurnResponseEvent" } + }, + "additionalProperties": false, + "required": [ + "event" ], - "discriminator": { - "propertyName": "type", - "mapping": { - "llm_as_judge": 
"#/components/schemas/LLMAsJudgeScoringFnParams", - "regex_parser": "#/components/schemas/RegexParserScoringFnParams", - "basic": "#/components/schemas/BasicScoringFnParams" - } - } + "description": "streamed agent turn completion response." }, - "EvaluateRowsRequest": { + "AgentTurnResponseTurnCompletePayload": { "type": "object", "properties": { - "input_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } + "event_type": { + "type": "string", + "const": "turn_complete", + "default": "turn_complete" + }, + "turn": { + "$ref": "#/components/schemas/Turn" + } + }, + "additionalProperties": false, + "required": [ + "event_type", + "turn" + ] + }, + "AgentTurnResponseTurnStartPayload": { + "type": "object", + "properties": { + "event_type": { + "type": "string", + "const": "turn_start", + "default": "turn_start" }, - "scoring_functions": { + "turn_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "event_type", + "turn_id" + ] + }, + "EmbeddingsRequest": { + "type": "object", + "properties": { + "model_id": { + "type": "string", + "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." + }, + "contents": { "type": "array", "items": { - "type": "string" - } - }, - "task_config": { - "$ref": "#/components/schemas/EvalTaskConfig" + "$ref": "#/components/schemas/InterleavedContent" + }, + "description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text." 
} }, "additionalProperties": false, "required": [ - "input_rows", - "scoring_functions", - "task_config" + "model_id", + "contents" ] }, - "EvaluateResponse": { + "EmbeddingsResponse": { "type": "object", "properties": { - "generations": { + "embeddings": { "type": "array", "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] + "type": "array", + "items": { + "type": "number" } - } - }, - "scores": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - } + }, + "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}" } }, "additionalProperties": false, "required": [ - "generations", - "scores" - ] + "embeddings" + ], + "description": "Response containing generated embeddings." 
}, - "ScoringResult": { + "EvaluateRowsRequest": { "type": "object", "properties": { - "score_rows": { + "input_rows": { "type": "array", "items": { "type": "object", @@ -4940,36 +5410,21 @@ } } }, - "aggregated_results": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] + "scoring_functions": { + "type": "array", + "items": { + "type": "string" } + }, + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, "required": [ - "score_rows", - "aggregated_results" + "input_rows", + "scoring_functions", + "task_config" ] }, "Session": { @@ -5287,69 +5742,6 @@ "type" ] }, - "EvalTask": { - "type": "object", - "properties": { - "identifier": { - "type": "string" - }, - "provider_resource_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "type": { - "type": "string", - "const": "eval_task", - "default": "eval_task" - }, - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "identifier", - "provider_resource_id", - "provider_id", - "type", - "dataset_id", - "scoring_functions", - "metadata" - ] - }, "Model": { "type": "object", "properties": { @@ -5891,15 +6283,6 @@ ], "description": "Artifacts of a finetuning job." 
}, - "JobStatus": { - "type": "string", - "enum": [ - "completed", - "in_progress", - "failed", - "scheduled" - ] - }, "PostTrainingJobStatusResponse": { "type": "object", "properties": { @@ -6243,21 +6626,6 @@ "data" ] }, - "ListEvalTasksResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/EvalTask" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ] - }, "ListModelsResponse": { "type": "object", "properties": { @@ -7169,22 +7537,22 @@ "data" ] }, - "RegisterDatasetRequest": { + "RegisterBenchmarkRequest": { "type": "object", "properties": { + "benchmark_id": { + "type": "string" + }, "dataset_id": { "type": "string" }, - "dataset_schema": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ParamType" + "scoring_functions": { + "type": "array", + "items": { + "type": "string" } }, - "url": { - "$ref": "#/components/schemas/URL" - }, - "provider_dataset_id": { + "provider_benchmark_id": { "type": "string" }, "provider_id": { @@ -7218,27 +7586,27 @@ }, "additionalProperties": false, "required": [ + "benchmark_id", "dataset_id", - "dataset_schema", - "url" + "scoring_functions" ] }, - "RegisterEvalTaskRequest": { + "RegisterDatasetRequest": { "type": "object", "properties": { - "eval_task_id": { - "type": "string" - }, "dataset_id": { "type": "string" }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" + "dataset_schema": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ParamType" } }, - "provider_eval_task_id": { + "url": { + "$ref": "#/components/schemas/URL" + }, + "provider_dataset_id": { "type": "string" }, "provider_id": { @@ -7272,9 +7640,9 @@ }, "additionalProperties": false, "required": [ - "eval_task_id", "dataset_id", - "scoring_functions" + "dataset_schema", + "url" ] }, "RegisterModelRequest": { @@ -7468,7 +7836,7 @@ "type": "object", "properties": { "task_config": { - 
"$ref": "#/components/schemas/EvalTaskConfig" + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, @@ -7476,18 +7844,6 @@ "task_config" ] }, - "Job": { - "type": "object", - "properties": { - "job_id": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_id" - ] - }, "RunShieldRequest": { "type": "object", "properties": { @@ -7970,6 +8326,9 @@ { "name": "BatchInference (Coming Soon)" }, + { + "name": "Benchmarks" + }, { "name": "DatasetIO" }, @@ -7979,9 +8338,6 @@ { "name": "Eval" }, - { - "name": "EvalTasks" - }, { "name": "Inference", "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", @@ -8033,10 +8389,10 @@ "tags": [ "Agents", "BatchInference (Coming Soon)", + "Benchmarks", "DatasetIO", "Datasets", "Eval", - "EvalTasks", "Inference", "Inspect", "Models", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index a646d7e089..b30025020b 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,6 +10,175 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: + /v1/eval/tasks/{task_id}/evaluations: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedEvaluateRowsRequest' + required: true + deprecated: true + /v1/eval-tasks/{task_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/Benchmark' + - type: 'null' + tags: 
+ - Benchmarks + description: '' + parameters: + - name: eval_task_id + in: query + required: true + schema: + type: string + deprecated: true + /v1/eval/tasks/{task_id}/jobs/{job_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/JobStatus' + - type: 'null' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + delete: + responses: + '200': + description: OK + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval-tasks: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListBenchmarksResponse' + tags: + - Benchmarks + description: '' + parameters: [] + deprecated: true + post: + responses: + '200': + description: OK + tags: + - Benchmarks + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' + required: true + deprecated: true + /v1/eval/tasks/{task_id}/jobs: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/Job' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + 
content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedRunEvalRequest' + required: true + deprecated: true /v1/datasetio/rows: get: responses: @@ -322,7 +491,7 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/tasks/{task_id}/evaluations: + /v1/eval/benchmarks/{benchmark_id}/evaluations: post: responses: '200': @@ -335,7 +504,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -407,7 +576,7 @@ paths: required: true schema: type: string - /v1/datasets/{dataset_id}: + /v1/eval/benchmarks/{benchmark_id}: get: responses: '200': @@ -416,21 +585,28 @@ paths: application/json: schema: oneOf: - - $ref: '#/components/schemas/Dataset' + - $ref: '#/components/schemas/Benchmark' - type: 'null' tags: - - Datasets + - Benchmarks description: '' parameters: - - name: dataset_id + - name: benchmark_id in: path required: true schema: type: string - delete: + /v1/datasets/{dataset_id}: + get: responses: '200': description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/Dataset' + - type: 'null' tags: - Datasets description: '' @@ -440,22 +616,15 @@ paths: required: true schema: type: string - /v1/eval-tasks/{eval_task_id}: - get: + delete: responses: '200': description: OK - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/EvalTask' - - type: 'null' tags: - - EvalTasks + - Datasets description: '' parameters: - - name: eval_task_id + - name: dataset_id in: path required: true schema: @@ -802,7 +971,7 @@ paths: schema: $ref: '#/components/schemas/InvokeToolRequest' required: true - /v1/eval/tasks/{task_id}/jobs/{job_id}: + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}: get: responses: '200': @@ -817,7 +986,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -835,7 +1004,7 @@ paths: - Eval description: '' 
parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -845,7 +1014,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: get: responses: '200': @@ -858,17 +1027,17 @@ paths: - Eval description: '' parameters: - - name: job_id + - name: benchmark_id in: path required: true schema: type: string - - name: task_id + - name: job_id in: path required: true schema: type: string - /v1/datasets: + /v1/eval/benchmarks: get: responses: '200': @@ -876,9 +1045,9 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ListDatasetsResponse' + $ref: '#/components/schemas/ListBenchmarksResponse' tags: - - Datasets + - Benchmarks description: '' parameters: [] post: @@ -886,16 +1055,16 @@ paths: '200': description: OK tags: - - Datasets + - Benchmarks description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/RegisterDatasetRequest' + $ref: '#/components/schemas/RegisterBenchmarkRequest' required: true - /v1/eval-tasks: + /v1/datasets: get: responses: '200': @@ -903,9 +1072,9 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ListEvalTasksResponse' + $ref: '#/components/schemas/ListDatasetsResponse' tags: - - EvalTasks + - Datasets description: '' parameters: [] post: @@ -913,14 +1082,14 @@ paths: '200': description: OK tags: - - EvalTasks + - Datasets description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/RegisterEvalTaskRequest' + $ref: '#/components/schemas/RegisterDatasetRequest' required: true /v1/models: get: @@ -1278,7 +1447,7 @@ paths: type: array items: type: string - /v1/eval/tasks/{task_id}/jobs: + /v1/eval/benchmarks/{benchmark_id}/jobs: post: responses: '200': @@ -1291,7 +1460,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true 
schema: @@ -1429,65 +1598,146 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: - AppendRowsRequest: + AgentCandidate: type: object properties: - dataset_id: + type: type: string - rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + const: agent + default: agent + config: + $ref: '#/components/schemas/AgentConfig' additionalProperties: false required: - - dataset_id - - rows - CompletionMessage: + - type + - config + AgentConfig: type: object properties: - role: - type: string - const: assistant - default: assistant - description: >- - Must be "assistant" to identify this as the model's response - content: - $ref: '#/components/schemas/InterleavedContent' - description: The content of the model's response - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: - The model finished generating the entire response. - `StopReason.end_of_message`: - The model finished generating but generated a partial response -- usually, - a tool call. The user may call the tool and continue the conversation - with the tool's response. - `StopReason.out_of_tokens`: The model ran - out of token budget. - tool_calls: + sampling_params: + $ref: '#/components/schemas/SamplingParams' + input_shields: type: array items: - $ref: '#/components/schemas/ToolCall' - description: >- - List of tool calls. Each tool call is a ToolCall object. - additionalProperties: false - required: - - role - - content - - stop_reason - description: >- - A message containing the model's (assistant) response in a chat conversation. 
+ type: string + output_shields: + type: array + items: + type: string + toolgroups: + type: array + items: + $ref: '#/components/schemas/AgentTool' + client_tools: + type: array + items: + $ref: '#/components/schemas/ToolDef' + tool_choice: + type: string + enum: + - auto + - required + description: >- + Whether tool use is required or automatic. This is a hint to the model + which may not be followed. It depends on the Instruction Following capabilities + of the model. + tool_prompt_format: + type: string + enum: + - json + - function_tag + - python_list + description: >- + Prompt format for calling custom / zero shot tools. + tool_config: + $ref: '#/components/schemas/ToolConfig' + max_infer_iters: + type: integer + default: 10 + model: + type: string + instructions: + type: string + enable_session_persistence: + type: boolean + response_format: + $ref: '#/components/schemas/ResponseFormat' + additionalProperties: false + required: + - model + - instructions + - enable_session_persistence + AgentTool: + oneOf: + - type: string + - type: object + properties: + name: + type: string + args: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + - args + AggregationFunctionType: + type: string + enum: + - average + - median + - categorical_count + - accuracy + BasicScoringFnParams: + type: object + properties: + type: + type: string + const: basic + default: basic + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type + BenchmarkConfig: + type: object + properties: + type: + type: string + const: benchmark + default: benchmark + eval_candidate: + $ref: '#/components/schemas/EvalCandidate' + scoring_params: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringFnParams' + num_examples: + type: integer + 
additionalProperties: false + required: + - type + - eval_candidate + - scoring_params + EvalCandidate: + oneOf: + - $ref: '#/components/schemas/ModelCandidate' + - $ref: '#/components/schemas/AgentCandidate' + discriminator: + propertyName: type + mapping: + model: '#/components/schemas/ModelCandidate' + agent: '#/components/schemas/AgentCandidate' GrammarResponseFormat: type: object properties: @@ -1598,19 +1848,65 @@ components: - json_schema description: >- Configuration for JSON schema-guided response generation. - Message: - oneOf: - - $ref: '#/components/schemas/UserMessage' - - $ref: '#/components/schemas/SystemMessage' - - $ref: '#/components/schemas/ToolResponseMessage' - - $ref: '#/components/schemas/CompletionMessage' - discriminator: - propertyName: role - mapping: - user: '#/components/schemas/UserMessage' - system: '#/components/schemas/SystemMessage' - tool: '#/components/schemas/ToolResponseMessage' - assistant: '#/components/schemas/CompletionMessage' + LLMAsJudgeScoringFnParams: + type: object + properties: + type: + type: string + const: llm_as_judge + default: llm_as_judge + judge_model: + type: string + prompt_template: + type: string + judge_score_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type + - judge_model + ModelCandidate: + type: object + properties: + type: + type: string + const: model + default: model + model: + type: string + sampling_params: + $ref: '#/components/schemas/SamplingParams' + system_message: + $ref: '#/components/schemas/SystemMessage' + additionalProperties: false + required: + - type + - model + - sampling_params + RegexParserScoringFnParams: + type: object + properties: + type: + type: string + const: regex_parser + default: regex_parser + parsing_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: 
'#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type ResponseFormat: oneOf: - $ref: '#/components/schemas/JsonSchemaResponseFormat' @@ -1645,6 +1941,17 @@ components: greedy: '#/components/schemas/GreedySamplingStrategy' top_p: '#/components/schemas/TopPSamplingStrategy' top_k: '#/components/schemas/TopKSamplingStrategy' + ScoringFnParams: + oneOf: + - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' + - $ref: '#/components/schemas/RegexParserScoringFnParams' + - $ref: '#/components/schemas/BasicScoringFnParams' + discriminator: + propertyName: type + mapping: + llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' + regex_parser: '#/components/schemas/RegexParserScoringFnParams' + basic: '#/components/schemas/BasicScoringFnParams' SystemMessage: type: object properties: @@ -1683,75 +1990,76 @@ components: - type - text description: A text content item - ToolCall: + ToolConfig: type: object properties: - call_id: + tool_choice: + type: string + enum: + - auto + - required + description: >- + (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. + default: auto + tool_prompt_format: + type: string + enum: + - json + - function_tag + - python_list + description: >- + (Optional) Instructs the model how to format tool calls. By default, Llama + Stack will attempt to use a format that is best adapted to the model. + - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. + - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a + tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python + syntax -- a list of function calls. + system_message_behavior: + type: string + enum: + - append + - replace + description: >- + (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: + Appends the provided system message to the default system prompt. 
- `SystemMessageBehavior.replace`: + Replaces the default system prompt with the provided system message. The + system message can include the string '{{function_definitions}}' to indicate + where the function definitions should be inserted. + default: append + additionalProperties: false + required: + - system_message_behavior + description: Configuration for tool use. + ToolDef: + type: object + properties: + name: type: string - tool_name: - oneOf: - - type: string - enum: - - brave_search - - wolfram_alpha - - photogen - - code_interpreter - - type: string - arguments: - type: object - additionalProperties: - oneOf: - - type: string - - type: integer - - type: number - - type: boolean - - type: 'null' - - type: array - items: - oneOf: - - type: string - - type: integer - - type: number - - type: boolean - - type: 'null' - - type: object - additionalProperties: - oneOf: - - type: string - - type: integer - - type: number - - type: boolean - - type: 'null' - additionalProperties: false - required: - - call_id - - tool_name - - arguments - ToolDefinition: - type: object - properties: - tool_name: - oneOf: - - type: string - enum: - - brave_search - - wolfram_alpha - - photogen - - code_interpreter - - type: string description: type: string parameters: + type: array + items: + $ref: '#/components/schemas/ToolParameter' + metadata: type: object additionalProperties: - $ref: '#/components/schemas/ToolParamDefinition' + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object additionalProperties: false required: - - tool_name - ToolParamDefinition: + - name + ToolParameter: type: object properties: - param_type: + name: + type: string + parameter_type: type: string description: type: string @@ -1768,41 +2076,10 @@ components: - type: object additionalProperties: false required: - - param_type - ToolResponseMessage: - type: object - properties: - role: - type: string - const: tool - default: tool - description: >- - 
Must be "tool" to identify this as a tool response - call_id: - type: string - description: >- - Unique identifier for the tool call this response is for - tool_name: - oneOf: - - type: string - enum: - - brave_search - - wolfram_alpha - - photogen - - code_interpreter - - type: string - description: Name of the tool that was called - content: - $ref: '#/components/schemas/InterleavedContent' - description: The response content from the tool - additionalProperties: false - required: - - role - - call_id - - tool_name - - content - description: >- - A message representing the result of a tool invocation. + - name + - parameter_type + - description + - required TopKSamplingStrategy: type: object properties: @@ -1834,11 +2111,382 @@ components: URL: type: object properties: - uri: + uri: + type: string + additionalProperties: false + required: + - uri + DeprecatedEvaluateRowsRequest: + type: object + properties: + input_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scoring_functions: + type: array + items: + type: string + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - input_rows + - scoring_functions + - task_config + EvaluateResponse: + type: object + properties: + generations: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scores: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + additionalProperties: false + required: + - generations + - scores + ScoringResult: + type: object + properties: + score_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + aggregated_results: + type: object + 
additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - score_rows + - aggregated_results + Benchmark: + type: object + properties: + identifier: + type: string + provider_resource_id: + type: string + provider_id: + type: string + type: + type: string + const: benchmark + default: benchmark + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - identifier + - provider_resource_id + - provider_id + - type + - dataset_id + - scoring_functions + - metadata + JobStatus: + type: string + enum: + - completed + - in_progress + - failed + - scheduled + ListBenchmarksResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/Benchmark' + additionalProperties: false + required: + - data + DeprecatedRegisterEvalTaskRequest: + type: object + properties: + eval_task_id: + type: string + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + provider_benchmark_id: + type: string + provider_id: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - eval_task_id + - dataset_id + - scoring_functions + DeprecatedRunEvalRequest: + type: object + properties: + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - task_config + Job: + type: object + properties: + job_id: + type: string + additionalProperties: false + required: + - job_id + AppendRowsRequest: + type: object + properties: + dataset_id: + type: string + rows: + type: array + 
items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - dataset_id + - rows + CompletionMessage: + type: object + properties: + role: + type: string + const: assistant + default: assistant + description: >- + Must be "assistant" to identify this as the model's response + content: + $ref: '#/components/schemas/InterleavedContent' + description: The content of the model's response + stop_reason: + type: string + enum: + - end_of_turn + - end_of_message + - out_of_tokens + description: >- + Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: + The model finished generating the entire response. - `StopReason.end_of_message`: + The model finished generating but generated a partial response -- usually, + a tool call. The user may call the tool and continue the conversation + with the tool's response. - `StopReason.out_of_tokens`: The model ran + out of token budget. + tool_calls: + type: array + items: + $ref: '#/components/schemas/ToolCall' + description: >- + List of tool calls. Each tool call is a ToolCall object. + additionalProperties: false + required: + - role + - content + - stop_reason + description: >- + A message containing the model's (assistant) response in a chat conversation. 
+ Message: + oneOf: + - $ref: '#/components/schemas/UserMessage' + - $ref: '#/components/schemas/SystemMessage' + - $ref: '#/components/schemas/ToolResponseMessage' + - $ref: '#/components/schemas/CompletionMessage' + discriminator: + propertyName: role + mapping: + user: '#/components/schemas/UserMessage' + system: '#/components/schemas/SystemMessage' + tool: '#/components/schemas/ToolResponseMessage' + assistant: '#/components/schemas/CompletionMessage' + ToolCall: + type: object + properties: + call_id: + type: string + tool_name: + oneOf: + - type: string + enum: + - brave_search + - wolfram_alpha + - photogen + - code_interpreter + - type: string + arguments: + type: object + additionalProperties: + oneOf: + - type: string + - type: integer + - type: number + - type: boolean + - type: 'null' + - type: array + items: + oneOf: + - type: string + - type: integer + - type: number + - type: boolean + - type: 'null' + - type: object + additionalProperties: + oneOf: + - type: string + - type: integer + - type: number + - type: boolean + - type: 'null' + additionalProperties: false + required: + - call_id + - tool_name + - arguments + ToolDefinition: + type: object + properties: + tool_name: + oneOf: + - type: string + enum: + - brave_search + - wolfram_alpha + - photogen + - code_interpreter + - type: string + description: + type: string + parameters: + type: object + additionalProperties: + $ref: '#/components/schemas/ToolParamDefinition' + additionalProperties: false + required: + - tool_name + ToolParamDefinition: + type: object + properties: + param_type: + type: string + description: + type: string + required: + type: boolean + default: true + default: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - param_type + ToolResponseMessage: + type: object + properties: + role: + type: string + const: tool + default: tool + description: >- + Must be "tool" to 
identify this as a tool response + call_id: type: string + description: >- + Unique identifier for the tool call this response is for + tool_name: + oneOf: + - type: string + enum: + - brave_search + - wolfram_alpha + - photogen + - code_interpreter + - type: string + description: Name of the tool that was called + content: + $ref: '#/components/schemas/InterleavedContent' + description: The response content from the tool additionalProperties: false required: - - uri + - role + - call_id + - tool_name + - content + description: >- + A message representing the result of a tool invocation. UserMessage: type: object properties: @@ -2063,46 +2711,6 @@ components: additionalProperties: false required: - job_uuid - ToolConfig: - type: object - properties: - tool_choice: - type: string - enum: - - auto - - required - description: >- - (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. - default: auto - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - (Optional) Instructs the model how to format tool calls. By default, Llama - Stack will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python - syntax -- a list of function calls. - system_message_behavior: - type: string - enum: - - append - - replace - description: >- - (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: - Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: - Replaces the default system prompt with the provided system message. The - system message can include the string '{{function_definitions}}' to indicate - where the function definitions should be inserted. 
- default: append - additionalProperties: false - required: - - system_message_behavior - description: Configuration for tool use. ChatCompletionRequest: type: object properties: @@ -2251,238 +2859,111 @@ components: type: string contentEncoding: base64 additionalProperties: false - required: - - type - - image - TextDelta: - type: object - properties: - type: - type: string - const: text - default: text - text: - type: string - additionalProperties: false - required: - - type - - text - ToolCallDelta: - type: object - properties: - type: - type: string - const: tool_call - default: tool_call - tool_call: - oneOf: - - type: string - - $ref: '#/components/schemas/ToolCall' - parse_status: - type: string - enum: - - started - - in_progress - - failed - - succeeded - additionalProperties: false - required: - - type - - tool_call - - parse_status - CompletionRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. The model must be registered with - Llama Stack and available via the /models endpoint. - content: - $ref: '#/components/schemas/InterleavedContent' - description: The content to generate a completion for - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: >- - (Optional) Parameters to control the sampling strategy - response_format: - $ref: '#/components/schemas/ResponseFormat' - description: >- - (Optional) Grammar specification for guided (structured) decoding - stream: - type: boolean - description: >- - (Optional) If True, generate an SSE event stream of the response. Defaults - to False. - logprobs: - type: object - properties: - top_k: - type: integer - default: 0 - description: >- - How many tokens (for each position) to return log probabilities for. - additionalProperties: false - description: >- - (Optional) If specified, log probabilities for each token position will - be returned. 
- additionalProperties: false - required: - - model_id - - content - CompletionResponseStreamChunk: - type: object - properties: - delta: - type: string - description: >- - New content generated since last chunk. This can be one or more tokens. - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Optional reason why generation stopped, if complete - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - description: >- - Optional log probabilities for generated tokens - additionalProperties: false - required: - - delta - description: >- - A chunk of a streamed completion response. - AgentConfig: - type: object - properties: - sampling_params: - $ref: '#/components/schemas/SamplingParams' - input_shields: - type: array - items: - type: string - output_shields: - type: array - items: - type: string - toolgroups: - type: array - items: - $ref: '#/components/schemas/AgentTool' - client_tools: - type: array - items: - $ref: '#/components/schemas/ToolDef' - tool_choice: - type: string - enum: - - auto - - required - description: >- - Whether tool use is required or automatic. This is a hint to the model - which may not be followed. It depends on the Instruction Following capabilities - of the model. - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - Prompt format for calling custom / zero shot tools. 
- tool_config: - $ref: '#/components/schemas/ToolConfig' - max_infer_iters: - type: integer - default: 10 - model: - type: string - instructions: - type: string - enable_session_persistence: - type: boolean - response_format: - $ref: '#/components/schemas/ResponseFormat' - additionalProperties: false - required: - - model - - instructions - - enable_session_persistence - AgentTool: - oneOf: - - type: string - - type: object - properties: - name: - type: string - args: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - name - - args - ToolDef: + required: + - type + - image + TextDelta: type: object properties: - name: + type: type: string - description: + const: text + default: text + text: type: string - parameters: - type: array - items: - $ref: '#/components/schemas/ToolParameter' - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object additionalProperties: false required: - - name - ToolParameter: + - type + - text + ToolCallDelta: type: object properties: - name: + type: type: string - parameter_type: + const: tool_call + default: tool_call + tool_call: + oneOf: + - type: string + - $ref: '#/components/schemas/ToolCall' + parse_status: type: string - description: + enum: + - started + - in_progress + - failed + - succeeded + additionalProperties: false + required: + - type + - tool_call + - parse_status + CompletionRequest: + type: object + properties: + model_id: type: string - required: + description: >- + The identifier of the model to use. The model must be registered with + Llama Stack and available via the /models endpoint. 
+ content: + $ref: '#/components/schemas/InterleavedContent' + description: The content to generate a completion for + sampling_params: + $ref: '#/components/schemas/SamplingParams' + description: >- + (Optional) Parameters to control the sampling strategy + response_format: + $ref: '#/components/schemas/ResponseFormat' + description: >- + (Optional) Grammar specification for guided (structured) decoding + stream: type: boolean - default: true - default: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + description: >- + (Optional) If True, generate an SSE event stream of the response. Defaults + to False. + logprobs: + type: object + properties: + top_k: + type: integer + default: 0 + description: >- + How many tokens (for each position) to return log probabilities for. + additionalProperties: false + description: >- + (Optional) If specified, log probabilities for each token position will + be returned. additionalProperties: false required: - - name - - parameter_type - - description - - required + - model_id + - content + CompletionResponseStreamChunk: + type: object + properties: + delta: + type: string + description: >- + New content generated since last chunk. This can be one or more tokens. + stop_reason: + type: string + enum: + - end_of_turn + - end_of_message + - out_of_tokens + description: >- + Optional reason why generation stopped, if complete + logprobs: + type: array + items: + $ref: '#/components/schemas/TokenLogProbs' + description: >- + Optional log probabilities for generated tokens + additionalProperties: false + required: + - delta + description: >- + A chunk of a streamed completion response. CreateAgentRequest: type: object properties: @@ -2893,232 +3374,75 @@ components: type: object properties: event: - $ref: '#/components/schemas/AgentTurnResponseEvent' - additionalProperties: false - required: - - event - description: streamed agent turn completion response. 
- AgentTurnResponseTurnCompletePayload: - type: object - properties: - event_type: - type: string - const: turn_complete - default: turn_complete - turn: - $ref: '#/components/schemas/Turn' - additionalProperties: false - required: - - event_type - - turn - AgentTurnResponseTurnStartPayload: - type: object - properties: - event_type: - type: string - const: turn_start - default: turn_start - turn_id: - type: string - additionalProperties: false - required: - - event_type - - turn_id - EmbeddingsRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. The model must be an embedding model - registered with Llama Stack and available via the /models endpoint. - contents: - type: array - items: - $ref: '#/components/schemas/InterleavedContent' - description: >- - List of contents to generate embeddings for. Note that content can be - multimodal. The behavior depends on the model and provider. Some models - may only support text. - additionalProperties: false - required: - - model_id - - contents - EmbeddingsResponse: - type: object - properties: - embeddings: - type: array - items: - type: array - items: - type: number - description: >- - List of embedding vectors, one per input content. Each embedding is a - list of floats. The dimensionality of the embedding is model-specific; - you can check model metadata using /models/{model_id} - additionalProperties: false - required: - - embeddings - description: >- - Response containing generated embeddings. 
- AgentCandidate: - type: object - properties: - type: - type: string - const: agent - default: agent - config: - $ref: '#/components/schemas/AgentConfig' - additionalProperties: false - required: - - type - - config - AggregationFunctionType: - type: string - enum: - - average - - median - - categorical_count - - accuracy - AppEvalTaskConfig: - type: object - properties: - type: - type: string - const: app - default: app - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - num_examples: - type: integer - additionalProperties: false - required: - - type - - eval_candidate - - scoring_params - BasicScoringFnParams: - type: object - properties: - type: - type: string - const: basic - default: basic - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - BenchmarkEvalTaskConfig: - type: object - properties: - type: - type: string - const: benchmark - default: benchmark - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - num_examples: - type: integer + $ref: '#/components/schemas/AgentTurnResponseEvent' additionalProperties: false required: - - type - - eval_candidate - EvalCandidate: - oneOf: - - $ref: '#/components/schemas/ModelCandidate' - - $ref: '#/components/schemas/AgentCandidate' - discriminator: - propertyName: type - mapping: - model: '#/components/schemas/ModelCandidate' - agent: '#/components/schemas/AgentCandidate' - EvalTaskConfig: - oneOf: - - $ref: '#/components/schemas/BenchmarkEvalTaskConfig' - - $ref: '#/components/schemas/AppEvalTaskConfig' - discriminator: - propertyName: type - mapping: - benchmark: '#/components/schemas/BenchmarkEvalTaskConfig' - app: '#/components/schemas/AppEvalTaskConfig' - LLMAsJudgeScoringFnParams: + - event + description: streamed agent turn completion response. 
+ AgentTurnResponseTurnCompletePayload: type: object properties: - type: - type: string - const: llm_as_judge - default: llm_as_judge - judge_model: - type: string - prompt_template: + event_type: type: string - judge_score_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' + const: turn_complete + default: turn_complete + turn: + $ref: '#/components/schemas/Turn' additionalProperties: false required: - - type - - judge_model - ModelCandidate: + - event_type + - turn + AgentTurnResponseTurnStartPayload: type: object properties: - type: + event_type: type: string - const: model - default: model - model: + const: turn_start + default: turn_start + turn_id: type: string - sampling_params: - $ref: '#/components/schemas/SamplingParams' - system_message: - $ref: '#/components/schemas/SystemMessage' additionalProperties: false required: - - type - - model - - sampling_params - RegexParserScoringFnParams: + - event_type + - turn_id + EmbeddingsRequest: type: object properties: - type: + model_id: type: string - const: regex_parser - default: regex_parser - parsing_regexes: + description: >- + The identifier of the model to use. The model must be an embedding model + registered with Llama Stack and available via the /models endpoint. + contents: type: array items: - type: string - aggregation_functions: + $ref: '#/components/schemas/InterleavedContent' + description: >- + List of contents to generate embeddings for. Note that content can be + multimodal. The behavior depends on the model and provider. Some models + may only support text. + additionalProperties: false + required: + - model_id + - contents + EmbeddingsResponse: + type: object + properties: + embeddings: type: array items: - $ref: '#/components/schemas/AggregationFunctionType' + type: array + items: + type: number + description: >- + List of embedding vectors, one per input content. 
Each embedding is a + list of floats. The dimensionality of the embedding is model-specific; + you can check model metadata using /models/{model_id} additionalProperties: false required: - - type - ScoringFnParams: - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - - $ref: '#/components/schemas/RegexParserScoringFnParams' - - $ref: '#/components/schemas/BasicScoringFnParams' - discriminator: - propertyName: type - mapping: - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - basic: '#/components/schemas/BasicScoringFnParams' + - embeddings + description: >- + Response containing generated embeddings. EvaluateRowsRequest: type: object properties: @@ -3139,64 +3463,12 @@ components: items: type: string task_config: - $ref: '#/components/schemas/EvalTaskConfig' + $ref: '#/components/schemas/BenchmarkConfig' additionalProperties: false required: - input_rows - scoring_functions - task_config - EvaluateResponse: - type: object - properties: - generations: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - scores: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - additionalProperties: false - required: - - generations - - scores - ScoringResult: - type: object - properties: - score_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - aggregated_results: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - score_rows - - aggregated_results Session: type: object properties: @@ -3401,44 +3673,6 @@ components: additionalProperties: false required: - type 
- EvalTask: - type: object - properties: - identifier: - type: string - provider_resource_id: - type: string - provider_id: - type: string - type: - type: string - const: eval_task - default: eval_task - dataset_id: - type: string - scoring_functions: - type: array - items: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - identifier - - provider_resource_id - - provider_id - - type - - dataset_id - - scoring_functions - - metadata Model: type: object properties: @@ -3766,13 +4000,6 @@ components: - job_uuid - checkpoints description: Artifacts of a finetuning job. - JobStatus: - type: string - enum: - - completed - - in_progress - - failed - - scheduled PostTrainingJobStatusResponse: type: object properties: @@ -3977,16 +4204,6 @@ components: additionalProperties: false required: - data - ListEvalTasksResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/EvalTask' - additionalProperties: false - required: - - data ListModelsResponse: type: object properties: @@ -4569,18 +4786,18 @@ components: additionalProperties: false required: - data - RegisterDatasetRequest: + RegisterBenchmarkRequest: type: object properties: + benchmark_id: + type: string dataset_id: type: string - dataset_schema: - type: object - additionalProperties: - $ref: '#/components/schemas/ParamType' - url: - $ref: '#/components/schemas/URL' - provider_dataset_id: + scoring_functions: + type: array + items: + type: string + provider_benchmark_id: type: string provider_id: type: string @@ -4596,21 +4813,21 @@ components: - type: object additionalProperties: false required: + - benchmark_id - dataset_id - - dataset_schema - - url - RegisterEvalTaskRequest: + - scoring_functions + RegisterDatasetRequest: type: object properties: - eval_task_id: - type: string dataset_id: type: string - 
scoring_functions: - type: array - items: - type: string - provider_eval_task_id: + dataset_schema: + type: object + additionalProperties: + $ref: '#/components/schemas/ParamType' + url: + $ref: '#/components/schemas/URL' + provider_dataset_id: type: string provider_id: type: string @@ -4626,9 +4843,9 @@ components: - type: object additionalProperties: false required: - - eval_task_id - dataset_id - - scoring_functions + - dataset_schema + - url RegisterModelRequest: type: object properties: @@ -4739,18 +4956,10 @@ components: type: object properties: task_config: - $ref: '#/components/schemas/EvalTaskConfig' + $ref: '#/components/schemas/BenchmarkConfig' additionalProperties: false required: - task_config - Job: - type: object - properties: - job_id: - type: string - additionalProperties: false - required: - - job_id RunShieldRequest: type: object properties: @@ -5049,10 +5258,10 @@ tags: x-displayName: >- Agents API for creating and interacting with agentic systems. - name: BatchInference (Coming Soon) + - name: Benchmarks - name: DatasetIO - name: Datasets - name: Eval - - name: EvalTasks - name: Inference description: >- This API provides the raw interface to the underlying models. 
Two kinds of models @@ -5083,10 +5292,10 @@ x-tagGroups: tags: - Agents - BatchInference (Coming Soon) + - Benchmarks - DatasetIO - Datasets - Eval - - EvalTasks - Inference - Inspect - Models diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index abe537c8e1..ee616b4716 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -324,7 +324,7 @@ "- vector_io\n", "container_image: null\n", "datasets: []\n", - "eval_tasks: []\n", + "benchmarks: []\n", "image_name: together\n", "metadata_store:\n", " db_path: /Users/ashwin/.llama/distributions/together/registry.db\n", @@ -508,7 +508,7 @@ "- vector_io\n", "container_image: null\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "image_name: together\n", "metadata_store:\n", " db_path: \u001b[35m/Users/ashwin/.llama/distributions/together/\u001b[0m\u001b[95mregistry.db\u001b[0m\n", diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 84da252469..8eecf84abb 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -370,7 +370,7 @@ "- tool_runtime\n", "datasets: []\n", "container_image: null\n", - "eval_tasks: []\n", + "benchmarks: []\n", "image_name: together\n", "memory_banks: []\n", "metadata_store:\n", @@ -551,7 +551,7 @@ "- tool_runtime\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "container_image: null\n", - "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "image_name: together\n", "memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "metadata_store:\n", diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index a0385cae00..0f3b997848 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ 
b/docs/openapi_generator/pyopenapi/generator.py @@ -647,6 +647,7 @@ def _build_operation(self, op: EndpointOperation) -> Operation: description = "\n".join( filter(None, [doc_string.short_description, doc_string.long_description]) ) + return Operation( tags=[op.defining_class.__name__], summary=None, @@ -656,6 +657,7 @@ def _build_operation(self, op: EndpointOperation) -> Operation: requestBody=requestBody, responses=responses, callbacks=callbacks, + deprecated=True if "DEPRECATED" in op.func_name else None, security=[] if op.public else None, ) diff --git a/docs/openapi_generator/pyopenapi/specification.py b/docs/openapi_generator/pyopenapi/specification.py index 4b54295c56..f96de58b69 100644 --- a/docs/openapi_generator/pyopenapi/specification.py +++ b/docs/openapi_generator/pyopenapi/specification.py @@ -117,6 +117,7 @@ class Operation: requestBody: Optional[RequestBody] = None callbacks: Optional[Dict[str, "Callback"]] = None security: Optional[List["SecurityRequirement"]] = None + deprecated: Optional[bool] = None @dataclass diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md index c4cb476e4f..f28e0d5fd7 100644 --- a/docs/source/building_applications/evals.md +++ b/docs/source/building_applications/evals.md @@ -41,14 +41,14 @@ system_message = { "content": SYSTEM_PROMPT_TEMPLATE, } -client.eval_tasks.register( - eval_task_id="meta-reference::mmmu", +client.benchmarks.register( + benchmark_id="meta-reference::mmmu", dataset_id=f"mmmu-{subset}-{split}", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) response = client.eval.evaluate_rows( - task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -99,14 +99,14 @@ eval_rows = client.datasetio.get_rows_paginated( ``` ```python -client.eval_tasks.register( - eval_task_id="meta-reference::simpleqa", +client.benchmarks.register( + 
benchmark_id="meta-reference::simpleqa", dataset_id=simpleqa_dataset_id, scoring_functions=["llm-as-judge::405b-simpleqa"], ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -156,7 +156,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ diff --git a/docs/source/building_applications/evaluation.md b/docs/source/building_applications/evaluation.md index 91e5c552bd..ad220f7518 100644 --- a/docs/source/building_applications/evaluation.md +++ b/docs/source/building_applications/evaluation.md @@ -10,15 +10,15 @@ Here's how to set up basic evaluation: ```python # Create an evaluation task -response = client.eval_tasks.register( - eval_task_id="my_eval", +response = client.benchmarks.register( + benchmark_id="my_eval", dataset_id="my_dataset", scoring_functions=["accuracy", "relevance"], ) # Run evaluation job = client.eval.run_eval( - task_id="my_eval", + benchmark_id="my_eval", task_config={ "type": "app", "eval_candidate": {"type": "agent", "config": agent_config}, @@ -26,5 +26,5 @@ job = client.eval.run_eval( ) # Get results -result = client.eval.job_result(task_id="my_eval", job_id=job.job_id) +result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id) ``` diff --git a/docs/source/concepts/evaluation_concepts.md b/docs/source/concepts/evaluation_concepts.md index 399d99d92d..3ca4b0ac8e 100644 --- a/docs/source/concepts/evaluation_concepts.md +++ b/docs/source/concepts/evaluation_concepts.md @@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications. 
- `/datasetio` + `/datasets` API - `/scoring` + `/scoring_functions` API -- `/eval` + `/eval_tasks` API +- `/eval` + `/benchmarks` API This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). @@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo - **Scoring**: evaluate outputs of the system. - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics. - **Eval**: generate outputs (via Inference or Agents) and perform scoring. - - Associated with `EvalTask` resource. + - Associated with `Benchmark` resource. Use the following decision tree to decide how to use LlamaStack Evaluation flow. diff --git a/docs/source/concepts/index.md b/docs/source/concepts/index.md index 1437ec6232..403e47c489 100644 --- a/docs/source/concepts/index.md +++ b/docs/source/concepts/index.md @@ -42,7 +42,7 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi - **Tool Runtime** is associated with `ToolGroup` resources. - **DatasetIO** is associated with `Dataset` resources. - **Scoring** is associated with `ScoringFunction` resources. -- **Eval** is associated with `Model` and `EvalTask` resources. +- **Eval** is associated with `Model` and `Benchmark` resources. Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack. 
diff --git a/docs/source/playground/index.md b/docs/source/playground/index.md index d74bf1a03b..9691609abf 100644 --- a/docs/source/playground/index.md +++ b/docs/source/playground/index.md @@ -64,7 +64,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie ``` ```bash - $ llama-stack-client eval_tasks register \ + $ llama-stack-client benchmarks register \ --eval-task-id meta-reference-mmlu \ --provider-id meta-reference \ --dataset-id mmlu \ @@ -86,7 +86,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie - Under the hood, it uses Llama Stack's `/providers` API to get information about the providers. - **API Resources**: Inspect Llama Stack API resources - - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `eval_tasks`, `shields`). + - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `benchmarks`, `shields`). - Under the hood, it uses Llama Stack's `//list` API to get information about each resources. - Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources. diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md index 86f66208af..71dbb47e59 100644 --- a/docs/source/references/evals_reference/index.md +++ b/docs/source/references/evals_reference/index.md @@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications. - `/datasetio` + `/datasets` API - `/scoring` + `/scoring_functions` API -- `/eval` + `/eval_tasks` API +- `/eval` + `/benchmarks` API This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. 
Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). @@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo - **Scoring**: evaluate outputs of the system. - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics. - **Eval**: generate outputs (via Inference or Agents) and perform scoring. - - Associated with `EvalTask` resource. + - Associated with `Benchmark` resource. Use the following decision tree to decide how to use LlamaStack Evaluation flow. @@ -77,14 +77,14 @@ system_message = { "content": SYSTEM_PROMPT_TEMPLATE, } -client.eval_tasks.register( - eval_task_id="meta-reference::mmmu", +client.benchmarks.register( + benchmark_id="meta-reference::mmmu", dataset_id=f"mmmu-{subset}-{split}", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) response = client.eval.evaluate_rows( - task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -135,14 +135,14 @@ eval_rows = client.datasetio.get_rows_paginated( ``` ```python -client.eval_tasks.register( - eval_task_id="meta-reference::simpleqa", +client.benchmarks.register( + benchmark_id="meta-reference::simpleqa", dataset_id=simpleqa_dataset_id, scoring_functions=["llm-as-judge::405b-simpleqa"], ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -192,7 +192,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + 
benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -281,7 +281,7 @@ The following examples give the quick steps to start running evaluations using t #### Benchmark Evaluation CLI Usage: There are 2 inputs necessary for running a benchmark eval -- `eval-task-id`: the identifier associated with the eval task. Each `EvalTask` is parametrized by +- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by - `dataset_id`: the identifier associated with the dataset. - `List[scoring_function_id]`: list of scoring function identifiers. - `eval-task-config`: specifies the configuration of the model / agent to evaluate on. @@ -289,7 +289,7 @@ Usage: There are 2 inputs necessary for running a benchmark eval ``` llama-stack-client eval run_benchmark \ ---eval-task-config ~/eval_task_config.json \ +--eval-task-config ~/benchmark_config.json \ --visualize ``` @@ -309,15 +309,15 @@ llama-stack-client eval run_scoring ... --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] +$ llama-stack-client benchmarks register --eval-task-id --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] ``` Options: @@ -191,7 +191,7 @@ Options: - `--num-examples`: Optional. Number of examples to evaluate (useful for debugging) - `--visualize`: Optional flag. 
If set, visualizes evaluation results after completion -Example eval_task_config.json: +Example benchmark_config.json: ```json { "type": "benchmark", diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md index 8a06e22442..9d1130422f 100644 --- a/docs/source/references/python_sdk_reference/index.md +++ b/docs/source/references/python_sdk_reference/index.md @@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job Methods: -- client.eval.evaluate_rows(task_id, \*\*params) -> EvaluateResponse -- client.eval.run_eval(task_id, \*\*params) -> Job +- client.eval.evaluate_rows(benchmark_id, \*\*params) -> EvaluateResponse +- client.eval.run_eval(benchmark_id, \*\*params) -> Job ### Jobs @@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse Methods: -- client.eval.jobs.retrieve(job_id, \*, task_id) -> EvaluateResponse -- client.eval.jobs.cancel(job_id, \*, task_id) -> None -- client.eval.jobs.status(job_id, \*, task_id) -> Optional[JobStatusResponse] +- client.eval.jobs.retrieve(job_id, \*, benchmark_id) -> EvaluateResponse +- client.eval.jobs.cancel(job_id, \*, benchmark_id) -> None +- client.eval.jobs.status(job_id, \*, benchmark_id) -> Optional[JobStatusResponse] ## Inspect @@ -443,20 +443,20 @@ Methods: - client.scoring_functions.list() -> ScoringFunctionListResponse - client.scoring_functions.register(\*\*params) -> None -## EvalTasks +## Benchmarks Types: ```python from llama_stack_client.types import ( - EvalTask, - ListEvalTasksResponse, - EvalTaskListResponse, + Benchmark, + ListBenchmarksResponse, + BenchmarkListResponse, ) ``` Methods: -- client.eval_tasks.retrieve(eval_task_id) -> Optional[EvalTask] -- client.eval_tasks.list() -> EvalTaskListResponse -- client.eval_tasks.register(\*\*params) -> None +- client.benchmarks.retrieve(benchmark_id) -> Optional[Benchmark] +- client.benchmarks.list() -> BenchmarkListResponse +- 
client.benchmarks.register(\*\*params) -> None diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/benchmarks/__init__.py similarity index 81% rename from llama_stack/apis/eval_tasks/__init__.py rename to llama_stack/apis/benchmarks/__init__.py index 7ca2167068..f8f5649570 100644 --- a/llama_stack/apis/eval_tasks/__init__.py +++ b/llama_stack/apis/benchmarks/__init__.py @@ -4,4 +4,4 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from .eval_tasks import * # noqa: F401 F403 +from .benchmarks import * # noqa: F401 F403 diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py new file mode 100644 index 0000000000..50019b18c7 --- /dev/null +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -0,0 +1,86 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
+
+from llama_models.schema_utils import json_schema_type, webmethod
+from pydantic import BaseModel, Field
+
+from llama_stack.apis.resource import Resource, ResourceType
+
+
+class CommonBenchmarkFields(BaseModel):
+    dataset_id: str
+    scoring_functions: List[str]
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Metadata for this evaluation task",
+    )
+
+
+@json_schema_type
+class Benchmark(CommonBenchmarkFields, Resource):
+    type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
+
+    @property
+    def benchmark_id(self) -> str:
+        return self.identifier
+
+    @property
+    def provider_benchmark_id(self) -> str:
+        return self.provider_resource_id
+
+
+class BenchmarkInput(CommonBenchmarkFields, BaseModel):
+    benchmark_id: str
+    provider_id: Optional[str] = None
+    provider_benchmark_id: Optional[str] = None
+
+
+class ListBenchmarksResponse(BaseModel):
+    data: List[Benchmark]
+
+
+@runtime_checkable
+class Benchmarks(Protocol):
+    @webmethod(route="/eval/benchmarks", method="GET")
+    async def list_benchmarks(self) -> ListBenchmarksResponse: ...
+
+    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
+    async def get_benchmark(
+        self,
+        benchmark_id: str,
+    ) -> Optional[Benchmark]: ...
+
+    @webmethod(route="/eval/benchmarks", method="POST")
+    async def register_benchmark(
+        self,
+        benchmark_id: str,
+        dataset_id: str,
+        scoring_functions: List[str],
+        provider_benchmark_id: Optional[str] = None,
+        provider_id: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None: ...
+
+    @webmethod(route="/eval-tasks", method="GET")
+    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ...
+
+    @webmethod(route="/eval-tasks/{eval_task_id}", method="GET")
+    async def DEPRECATED_get_eval_task(
+        self,
+        eval_task_id: str,
+    ) -> Optional[Benchmark]: ...
+ + @webmethod(route="/eval-tasks", method="POST") + async def DEPRECATED_register_eval_task( + self, + eval_task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_benchmark_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py index ccc395b80b..0751b2c9b2 100644 --- a/llama_stack/apis/datatypes.py +++ b/llama_stack/apis/datatypes.py @@ -28,7 +28,7 @@ class Api(Enum): vector_dbs = "vector_dbs" datasets = "datasets" scoring_functions = "scoring_functions" - eval_tasks = "eval_tasks" + benchmarks = "benchmarks" tool_groups = "tool_groups" # built-in API diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index ae13a5bd95..e5c7821503 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -38,19 +38,9 @@ class AgentCandidate(BaseModel): @json_schema_type -class BenchmarkEvalTaskConfig(BaseModel): +class BenchmarkConfig(BaseModel): type: Literal["benchmark"] = "benchmark" eval_candidate: EvalCandidate - num_examples: Optional[int] = Field( - description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated", - default=None, - ) - - -@json_schema_type -class AppEvalTaskConfig(BaseModel): - type: Literal["app"] = "app" - eval_candidate: EvalCandidate scoring_params: Dict[str, ScoringFnParams] = Field( description="Map between scoring function id and parameters for each scoring function you want to run", default_factory=dict, @@ -62,12 +52,6 @@ class AppEvalTaskConfig(BaseModel): # we could optinally add any specific dataset config here -EvalTaskConfig = register_schema( - Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")], - name="EvalTaskConfig", -) - - @json_schema_type class EvaluateResponse(BaseModel): generations: List[Dict[str, Any]] @@ -76,27 +60,52 @@ 
class EvaluateResponse(BaseModel): class Eval(Protocol): - @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST") async def run_eval( + self, + benchmark_id: str, + task_config: BenchmarkConfig, + ) -> Job: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") + async def evaluate_rows( + self, + benchmark_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE") + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + async def DEPRECATED_run_eval( self, task_id: str, - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> Job: ... @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") - async def evaluate_rows( + async def DEPRECATED_evaluate_rows( self, task_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") - async def job_cancel(self, task_id: str, job_id: str) -> None: ... 
+ async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ... + async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ... diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py deleted file mode 100644 index a0a5330553..0000000000 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. -from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable - -from llama_models.schema_utils import json_schema_type, webmethod -from pydantic import BaseModel, Field - -from llama_stack.apis.resource import Resource, ResourceType - - -class CommonEvalTaskFields(BaseModel): - dataset_id: str - scoring_functions: List[str] - metadata: Dict[str, Any] = Field( - default_factory=dict, - description="Metadata for this evaluation task", - ) - - -@json_schema_type -class EvalTask(CommonEvalTaskFields, Resource): - type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value - - @property - def eval_task_id(self) -> str: - return self.identifier - - @property - def provider_eval_task_id(self) -> str: - return self.provider_resource_id - - -class EvalTaskInput(CommonEvalTaskFields, BaseModel): - eval_task_id: str - provider_id: Optional[str] = None - provider_eval_task_id: Optional[str] = None - - -class ListEvalTasksResponse(BaseModel): - data: List[EvalTask] - - -@runtime_checkable -class EvalTasks(Protocol): - @webmethod(route="/eval-tasks", method="GET") - async def list_eval_tasks(self) -> ListEvalTasksResponse: ... 
- - @webmethod(route="/eval-tasks/{eval_task_id}", method="GET") - async def get_eval_task( - self, - eval_task_id: str, - ) -> Optional[EvalTask]: ... - - @webmethod(route="/eval-tasks", method="POST") - async def register_eval_task( - self, - eval_task_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_eval_task_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py index 145113a5d6..70ec63c55d 100644 --- a/llama_stack/apis/resource.py +++ b/llama_stack/apis/resource.py @@ -15,7 +15,7 @@ class ResourceType(Enum): vector_db = "vector_db" dataset = "dataset" scoring_function = "scoring_function" - eval_task = "eval_task" + benchmark = "benchmark" tool = "tool" tool_group = "tool_group" diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index 5622aaeac8..63ae1dc738 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -13,8 +13,8 @@ Literal, Optional, Protocol, - runtime_checkable, Union, + runtime_checkable, ) from llama_models.llama3.api.datatypes import Primitive diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 97706f22a5..f62996081b 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -8,10 +8,10 @@ from pydantic import BaseModel, Field +from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset, DatasetInput from llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput from llama_stack.apis.inference import Inference from llama_stack.apis.models import Model, ModelInput from llama_stack.apis.safety import Safety @@ -37,7 +37,7 @@ VectorDB, Dataset, ScoringFn, - EvalTask, + 
Benchmark, Tool, ToolGroup, ] @@ -50,7 +50,7 @@ VectorDB, Dataset, ScoringFn, - EvalTask, + Benchmark, Tool, ToolGroup, ], @@ -173,7 +173,7 @@ class StackRunConfig(BaseModel): vector_dbs: List[VectorDBInput] = Field(default_factory=list) datasets: List[DatasetInput] = Field(default_factory=list) scoring_fns: List[ScoringFnInput] = Field(default_factory=list) - eval_tasks: List[EvalTaskInput] = Field(default_factory=list) + benchmarks: List[BenchmarkInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) server: ServerConfig = Field( diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 2dcf38463b..384e2c3c89 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -44,7 +44,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: router_api=Api.scoring, ), AutoRoutedApiInfo( - routing_table_api=Api.eval_tasks, + routing_table_api=Api.benchmarks, router_api=Api.eval, ), AutoRoutedApiInfo( diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index 353c2971ba..0bc2e774c1 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -9,10 +9,10 @@ from typing import Any, Dict, List, Set from llama_stack.apis.agents import Agents +from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models @@ -37,8 +37,8 @@ from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.providers.datatypes import ( Api, + BenchmarksProtocolPrivate, DatasetsProtocolPrivate, - EvalTasksProtocolPrivate, InlineProviderSpec, 
ModelsProtocolPrivate, ProviderSpec, @@ -73,7 +73,7 @@ def api_protocol_map() -> Dict[Api, Any]: Api.scoring: Scoring, Api.scoring_functions: ScoringFunctions, Api.eval: Eval, - Api.eval_tasks: EvalTasks, + Api.benchmarks: Benchmarks, Api.post_training: PostTraining, Api.tool_groups: ToolGroups, Api.tool_runtime: ToolRuntime, @@ -92,7 +92,7 @@ def additional_protocols_map() -> Dict[Api, Any]: ScoringFunctions, Api.scoring_functions, ), - Api.eval: (EvalTasksProtocolPrivate, EvalTasks, Api.eval_tasks), + Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks), } diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py index 18197ca7f1..a54f57fb30 100644 --- a/llama_stack/distribution/routers/__init__.py +++ b/llama_stack/distribution/routers/__init__.py @@ -11,8 +11,8 @@ from llama_stack.providers.datatypes import Api, RoutingTable from .routing_tables import ( + BenchmarksRoutingTable, DatasetsRoutingTable, - EvalTasksRoutingTable, ModelsRoutingTable, ScoringFunctionsRoutingTable, ShieldsRoutingTable, @@ -33,7 +33,7 @@ async def get_routing_table_impl( "shields": ShieldsRoutingTable, "datasets": DatasetsRoutingTable, "scoring_functions": ScoringFunctionsRoutingTable, - "eval_tasks": EvalTasksRoutingTable, + "benchmarks": BenchmarksRoutingTable, "tool_groups": ToolGroupsRoutingTable, } diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index e716e44b08..f45975189f 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -9,9 +9,8 @@ from llama_stack.apis.common.content_types import URL, InterleavedContent from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult from llama_stack.apis.eval import ( - AppEvalTaskConfig, + BenchmarkConfig, Eval, - EvalTaskConfig, EvaluateResponse, Job, JobStatus, @@ -347,23 +346,23 @@ async def shutdown(self) -> None: async def run_eval( self, - task_id: str, - 
task_config: AppEvalTaskConfig, + benchmark_id: str, + task_config: BenchmarkConfig, ) -> Job: - return await self.routing_table.get_provider_impl(task_id).run_eval( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).run_eval( + benchmark_id=benchmark_id, task_config=task_config, ) async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).evaluate_rows( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows( + benchmark_id=benchmark_id, input_rows=input_rows, scoring_functions=scoring_functions, task_config=task_config, @@ -371,31 +370,73 @@ async def evaluate_rows( async def job_status( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> Optional[JobStatus]: - return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id) + return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id) async def job_cancel( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> None: - await self.routing_table.get_provider_impl(task_id).job_cancel( - task_id, + await self.routing_table.get_provider_impl(benchmark_id).job_cancel( + benchmark_id, job_id, ) async def job_result( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).job_result( - task_id, + return await self.routing_table.get_provider_impl(benchmark_id).job_result( + benchmark_id, job_id, ) + async def DEPRECATED_run_eval( + self, + task_id: str, + task_config: BenchmarkConfig, + ) -> Job: + return await self.run_eval(benchmark_id=task_id, task_config=task_config) + + async def DEPRECATED_evaluate_rows( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + 
scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: + return await self.evaluate_rows( + benchmark_id=task_id, + input_rows=input_rows, + scoring_functions=scoring_functions, + task_config=task_config, + ) + + async def DEPRECATED_job_status( + self, + task_id: str, + job_id: str, + ) -> Optional[JobStatus]: + return await self.job_status(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_cancel( + self, + task_id: str, + job_id: str, + ) -> None: + return await self.job_cancel(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_result( + self, + task_id: str, + job_id: str, + ) -> EvaluateResponse: + return await self.job_result(benchmark_id=task_id, job_id=job_id) + class ToolRuntimeRouter(ToolRuntime): class RagToolImpl(RAGToolRuntime): diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 009775ca52..2cddc3970d 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -4,14 +4,15 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import logging from typing import Any, Dict, List, Optional from pydantic import TypeAdapter +from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ParamType from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse -from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType from llama_stack.apis.resource import ResourceType from llama_stack.apis.scoring_functions import ( @@ -38,6 +39,8 @@ from llama_stack.distribution.store import DistributionRegistry from llama_stack.providers.datatypes import Api, RoutingTable +logger = logging.getLogger(__name__) + def get_impl_api(p: Any) -> Api: return p.__provider_spec__.api @@ -60,7 +63,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable elif api == Api.scoring: return await p.register_scoring_function(obj) elif api == Api.eval: - return await p.register_eval_task(obj) + return await p.register_benchmark(obj) elif api == Api.tool_runtime: return await p.register_tool(obj) else: @@ -121,7 +124,7 @@ async def add_objects(objs: List[RoutableObjectWithProvider], provider_id: str, scoring_functions = await p.list_scoring_functions() await add_objects(scoring_functions, pid, ScoringFn) elif api == Api.eval: - p.eval_task_store = self + p.benchmark_store = self elif api == Api.tool_runtime: p.tool_store = self @@ -141,8 +144,8 @@ def apiname_object(): return ("DatasetIO", "dataset") elif isinstance(self, ScoringFunctionsRoutingTable): return ("Scoring", "scoring_function") - elif isinstance(self, EvalTasksRoutingTable): - return ("Eval", "eval_task") + elif isinstance(self, BenchmarksRoutingTable): + return ("Eval", "benchmark") elif isinstance(self, ToolGroupsRoutingTable): return ("Tools", "tool") else: @@ -428,20 +431,20 @@ async def 
register_scoring_function( await self.register_object(scoring_fn) -class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks): - async def list_eval_tasks(self) -> ListEvalTasksResponse: - return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task")) +class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): + async def list_benchmarks(self) -> ListBenchmarksResponse: + return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark")) - async def get_eval_task(self, eval_task_id: str) -> Optional[EvalTask]: - return await self.get_object_by_identifier("eval_task", eval_task_id) + async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]: + return await self.get_object_by_identifier("benchmark", benchmark_id) - async def register_eval_task( + async def register_benchmark( self, - eval_task_id: str, + benchmark_id: str, dataset_id: str, scoring_functions: List[str], metadata: Optional[Dict[str, Any]] = None, - provider_eval_task_id: Optional[str] = None, + provider_benchmark_id: Optional[str] = None, provider_id: Optional[str] = None, ) -> None: if metadata is None: @@ -453,17 +456,46 @@ async def register_eval_task( raise ValueError( "No provider specified and multiple providers available. Please specify a provider_id." 
) - if provider_eval_task_id is None: - provider_eval_task_id = eval_task_id - eval_task = EvalTask( - identifier=eval_task_id, + if provider_benchmark_id is None: + provider_benchmark_id = benchmark_id + benchmark = Benchmark( + identifier=benchmark_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, provider_id=provider_id, - provider_resource_id=provider_eval_task_id, + provider_resource_id=provider_benchmark_id, + ) + await self.register_object(benchmark) + + async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: + logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.list_benchmarks() + + async def DEPRECATED_get_eval_task( + self, + eval_task_id: str, + ) -> Optional[Benchmark]: + logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.get_benchmark(eval_task_id) + + async def DEPRECATED_register_eval_task( + self, + eval_task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_benchmark_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: + logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.register_benchmark( + benchmark_id=eval_task_id, + dataset_id=dataset_id, + scoring_functions=scoring_functions, + metadata=metadata, + provider_benchmark_id=provider_benchmark_id, ) - await self.register_object(eval_task) class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index 2baad8ac45..9335dc3a95 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -15,10 +15,10 @@ from llama_stack.apis.agents import Agents from llama_stack.apis.batch_inference import BatchInference +from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from 
llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models @@ -53,7 +53,7 @@ class LlamaStack( PostTraining, VectorIO, Eval, - EvalTasks, + Benchmarks, Scoring, ScoringFunctions, DatasetIO, @@ -78,7 +78,7 @@ class LlamaStack( "register_scoring_function", "list_scoring_functions", ), - ("eval_tasks", Api.eval_tasks, "register_eval_task", "list_eval_tasks"), + ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"), ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"), ] diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md index c0a2597af5..8fceb5c63c 100644 --- a/llama_stack/distribution/ui/README.md +++ b/llama_stack/distribution/ui/README.md @@ -26,7 +26,7 @@ $ llama-stack-client datasets register \ ``` ```bash -$ llama-stack-client eval_tasks register \ +$ llama-stack-client benchmarks register \ --eval-task-id meta-reference-mmlu \ --provider-id meta-reference \ --dataset-id mmlu \ diff --git a/llama_stack/distribution/ui/page/distribution/eval_tasks.py b/llama_stack/distribution/ui/page/distribution/eval_tasks.py index f589696631..1428ae9ab2 100644 --- a/llama_stack/distribution/ui/page/distribution/eval_tasks.py +++ b/llama_stack/distribution/ui/page/distribution/eval_tasks.py @@ -8,12 +8,12 @@ from modules.api import llama_stack_api -def eval_tasks(): - # Eval Tasks Section - st.header("Eval Tasks") +def benchmarks(): + # Benchmarks Section + st.header("Benchmarks") - eval_tasks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list()} + benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()} - if len(eval_tasks_info) > 0: - selected_eval_task = st.selectbox("Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect") - 
st.json(eval_tasks_info[selected_eval_task], expanded=True) + if len(benchmarks_info) > 0: + selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect") + st.json(benchmarks_info[selected_benchmark], expanded=True) diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py index 94b840bcb8..684270d4de 100644 --- a/llama_stack/distribution/ui/page/distribution/resources.py +++ b/llama_stack/distribution/ui/page/distribution/resources.py @@ -4,8 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from page.distribution.benchmarks import benchmarks from page.distribution.datasets import datasets -from page.distribution.eval_tasks import eval_tasks from page.distribution.models import models from page.distribution.scoring_functions import scoring_functions from page.distribution.shields import shields @@ -20,7 +20,7 @@ def resources_page(): "Shields", "Scoring Functions", "Datasets", - "Eval Tasks", + "Benchmarks", ] icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"] selected_resource = option_menu( @@ -34,8 +34,8 @@ def resources_page(): }, }, ) - if selected_resource == "Eval Tasks": - eval_tasks() + if selected_resource == "Benchmarks": + benchmarks() elif selected_resource == "Vector Databases": vector_dbs() elif selected_resource == "Datasets": diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index 112d9cff02..f1cae714a9 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -11,28 +11,28 @@ from modules.api import llama_stack_api -def select_eval_task_1(): - # Select Eval Tasks +def select_benchmark_1(): + # Select Benchmarks st.subheader("1. 
Choose An Eval Task") - eval_tasks = llama_stack_api.client.eval_tasks.list() - eval_tasks = {et.identifier: et for et in eval_tasks} - eval_tasks_names = list(eval_tasks.keys()) - selected_eval_task = st.selectbox( + benchmarks = llama_stack_api.client.benchmarks.list() + benchmarks = {et.identifier: et for et in benchmarks} + benchmarks_names = list(benchmarks.keys()) + selected_benchmark = st.selectbox( "Choose an eval task.", - options=eval_tasks_names, + options=benchmarks_names, help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.", ) with st.expander("View Eval Task"): - st.json(eval_tasks[selected_eval_task], expanded=True) + st.json(benchmarks[selected_benchmark], expanded=True) - st.session_state["selected_eval_task"] = selected_eval_task - st.session_state["eval_tasks"] = eval_tasks + st.session_state["selected_benchmark"] = selected_benchmark + st.session_state["benchmarks"] = benchmarks if st.button("Confirm", key="confirm_1"): - st.session_state["selected_eval_task_1_next"] = True + st.session_state["selected_benchmark_1_next"] = True def define_eval_candidate_2(): - if not st.session_state.get("selected_eval_task_1_next", None): + if not st.session_state.get("selected_benchmark_1_next", None): return st.subheader("2. Define Eval Candidate") @@ -161,11 +161,11 @@ def run_evaluation_3(): Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button. 
""" ) - selected_eval_task = st.session_state["selected_eval_task"] - eval_tasks = st.session_state["eval_tasks"] + selected_benchmark = st.session_state["selected_benchmark"] + benchmarks = st.session_state["benchmarks"] eval_candidate = st.session_state["eval_candidate"] - dataset_id = eval_tasks[selected_eval_task].dataset_id + dataset_id = benchmarks[selected_benchmark].dataset_id rows = llama_stack_api.client.datasetio.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1, @@ -180,16 +180,16 @@ def run_evaluation_3(): help="Number of examples from the dataset to evaluate. ", ) - eval_task_config = { + benchmark_config = { "type": "benchmark", "eval_candidate": eval_candidate, "scoring_params": {}, } with st.expander("View Evaluation Task", expanded=True): - st.json(eval_tasks[selected_eval_task], expanded=True) + st.json(benchmarks[selected_benchmark], expanded=True) with st.expander("View Evaluation Task Configuration", expanded=True): - st.json(eval_task_config, expanded=True) + st.json(benchmark_config, expanded=True) # Add run button and handle evaluation if st.button("Run Evaluation"): @@ -209,10 +209,10 @@ def run_evaluation_3(): progress_bar.progress(progress, text=progress_text) # Run evaluation for current row eval_res = llama_stack_api.client.eval.evaluate_rows( - task_id=selected_eval_task, + benchmark_id=selected_benchmark, input_rows=[r], - scoring_functions=eval_tasks[selected_eval_task].scoring_functions, - task_config=eval_task_config, + scoring_functions=benchmarks[selected_benchmark].scoring_functions, + task_config=benchmark_config, ) for k in r.keys(): @@ -225,7 +225,7 @@ def run_evaluation_3(): output_res[k] = [] output_res[k].append(eval_res.generations[0][k]) - for scoring_fn in eval_tasks[selected_eval_task].scoring_functions: + for scoring_fn in benchmarks[selected_benchmark].scoring_functions: if scoring_fn not in output_res: output_res[scoring_fn] = [] output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) @@ 
-245,7 +245,7 @@ def native_evaluation_page(): st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙") st.title("📊 Evaluations (Generation + Scoring)") - select_eval_task_1() + select_benchmark_1() define_eval_candidate_2() run_evaluation_3() diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index ccdaf76e74..b92f9dc0a0 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -10,9 +10,9 @@ from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field +from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasets import Dataset from llama_stack.apis.datatypes import Api -from llama_stack.apis.eval_tasks import EvalTask from llama_stack.apis.models import Model from llama_stack.apis.scoring_functions import ScoringFn from llama_stack.apis.shields import Shield @@ -48,8 +48,8 @@ async def list_scoring_functions(self) -> List[ScoringFn]: ... async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ... -class EvalTasksProtocolPrivate(Protocol): - async def register_eval_task(self, eval_task: EvalTask) -> None: ... +class BenchmarksProtocolPrivate(Protocol): + async def register_benchmark(self, benchmark: Benchmark) -> None: ... 
class ToolsProtocolPrivate(Protocol): diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 1c44caf7f0..cd99c9ad89 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -8,13 +8,13 @@ from tqdm import tqdm from llama_stack.apis.agents import Agents, StepType +from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets -from llama_stack.apis.eval_tasks import EvalTask from llama_stack.apis.inference import Inference, UserMessage from llama_stack.apis.scoring import Scoring from llama_stack.distribution.datatypes import Api -from llama_stack.providers.datatypes import EvalTasksProtocolPrivate +from llama_stack.providers.datatypes import BenchmarksProtocolPrivate from llama_stack.providers.inline.agents.meta_reference.agent_instance import ( MEMORY_QUERY_TOOL, ) @@ -26,15 +26,15 @@ from llama_stack.providers.utils.kvstore import kvstore_impl from .....apis.common.job_types import Job -from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus +from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus from .config import MetaReferenceEvalConfig -EVAL_TASKS_PREFIX = "eval_tasks:" +EVAL_TASKS_PREFIX = "benchmarks:" class MetaReferenceEvalImpl( Eval, - EvalTasksProtocolPrivate, + BenchmarksProtocolPrivate, ): def __init__( self, @@ -55,36 +55,36 @@ def __init__( # TODO: assume sync job, will need jobs API for async scheduling self.jobs = {} - self.eval_tasks = {} + self.benchmarks = {} async def initialize(self) -> None: self.kvstore = await kvstore_impl(self.config.kvstore) - # Load existing eval_tasks from kvstore + # Load existing benchmarks from kvstore start_key = EVAL_TASKS_PREFIX end_key = f"{EVAL_TASKS_PREFIX}\xff" - stored_eval_tasks = await self.kvstore.range(start_key, 
end_key) + stored_benchmarks = await self.kvstore.range(start_key, end_key) - for eval_task in stored_eval_tasks: - eval_task = EvalTask.model_validate_json(eval_task) - self.eval_tasks[eval_task.identifier] = eval_task + for benchmark in stored_benchmarks: + benchmark = Benchmark.model_validate_json(benchmark) + self.benchmarks[benchmark.identifier] = benchmark async def shutdown(self) -> None: ... - async def register_eval_task(self, task_def: EvalTask) -> None: + async def register_benchmark(self, task_def: Benchmark) -> None: # Store in kvstore key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}" await self.kvstore.set( key=key, value=task_def.model_dump_json(), ) - self.eval_tasks[task_def.identifier] = task_def + self.benchmarks[task_def.identifier] = task_def async def run_eval( self, - task_id: str, - task_config: EvalTaskConfig, + benchmark_id: str, + task_config: BenchmarkConfig, ) -> Job: - task_def = self.eval_tasks[task_id] + task_def = self.benchmarks[benchmark_id] dataset_id = task_def.dataset_id candidate = task_config.eval_candidate scoring_functions = task_def.scoring_functions @@ -95,7 +95,7 @@ async def run_eval( rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples), ) res = await self.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=all_rows.rows, scoring_functions=scoring_functions, task_config=task_config, @@ -108,7 +108,7 @@ async def run_eval( return Job(job_id=job_id) async def _run_agent_generation( - self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig + self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig ) -> List[Dict[str, Any]]: candidate = task_config.eval_candidate create_response = await self.agents_api.create_agent(candidate.config) @@ -151,7 +151,7 @@ async def _run_agent_generation( return generations async def _run_model_generation( - self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig + self, input_rows: List[Dict[str, Any]], 
task_config: BenchmarkConfig ) -> List[Dict[str, Any]]: candidate = task_config.eval_candidate assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided" @@ -187,10 +187,10 @@ async def _run_model_generation( async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: candidate = task_config.eval_candidate if candidate.type == "agent": @@ -203,7 +203,7 @@ async def evaluate_rows( # scoring with generated_answer score_input_rows = [input_r | generated_r for input_r, generated_r in zip(input_rows, generations)] - if task_config.type == "app" and task_config.scoring_params is not None: + if task_config.scoring_params is not None: scoring_functions_dict = { scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None) for scoring_fn_id in scoring_functions @@ -217,18 +217,60 @@ async def evaluate_rows( return EvaluateResponse(generations=generations, scores=score_response.results) - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: if job_id in self.jobs: return JobStatus.completed return None - async def job_cancel(self, task_id: str, job_id: str) -> None: + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: raise NotImplementedError("Job cancel is not implemented yet") - async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse: - status = await self.job_status(task_id, job_id) + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: + status = await self.job_status(benchmark_id, job_id) if not status or status != JobStatus.completed: raise ValueError(f"Job is not completed, Status: {status.value}") return self.jobs[job_id] + + async def DEPRECATED_run_eval( + self, + task_id: str, + task_config: 
BenchmarkConfig, + ) -> Job: + return await self.run_eval(benchmark_id=task_id, task_config=task_config) + + async def DEPRECATED_evaluate_rows( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: + return await self.evaluate_rows( + benchmark_id=task_id, + input_rows=input_rows, + scoring_functions=scoring_functions, + task_config=task_config, + ) + + async def DEPRECATED_job_status( + self, + task_id: str, + job_id: str, + ) -> Optional[JobStatus]: + return await self.job_status(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_cancel( + self, + task_id: str, + job_id: str, + ) -> None: + return await self.job_cancel(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_result( + self, + task_id: str, + job_id: str, + ) -> EvaluateResponse: + return await self.job_result(benchmark_id=task_id, job_id=job_id) diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py index ec3d08728b..ad80b8601c 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -10,8 +10,8 @@ from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType from llama_stack.apis.eval.eval import ( - AppEvalTaskConfig, - BenchmarkEvalTaskConfig, + AppBenchmarkConfig, + BenchmarkBenchmarkConfig, ModelCandidate, ) from llama_stack.apis.inference import SamplingParams @@ -30,18 +30,18 @@ class Testeval: @pytest.mark.asyncio - async def test_eval_tasks_list(self, eval_stack): + async def test_benchmarks_list(self, eval_stack): # NOTE: this needs you to ensure that you are starting from a clean state # but so far we don't have an unregister API unfortunately, so be careful - eval_tasks_impl = eval_stack[Api.eval_tasks] - response = await eval_tasks_impl.list_eval_tasks() + benchmarks_impl = 
eval_stack[Api.benchmarks] + response = await benchmarks_impl.list_benchmarks() assert isinstance(response, list) @pytest.mark.asyncio async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model): - eval_impl, eval_tasks_impl, datasetio_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasetio], eval_stack[Api.datasets], eval_stack[Api.models], @@ -59,17 +59,17 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model scoring_functions = [ "basic::equality", ] - task_id = "meta-reference::app_eval" - await eval_tasks_impl.register_eval_task( - eval_task_id=task_id, + benchmark_id = "meta-reference::app_eval" + await benchmarks_impl.register_benchmark( + benchmark_id=benchmark_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=rows.rows, scoring_functions=scoring_functions, - task_config=AppEvalTaskConfig( + task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -92,9 +92,9 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model @pytest.mark.asyncio async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): - eval_impl, eval_tasks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasets], eval_stack[Api.models], ) @@ -105,15 +105,15 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): "basic::subset_of", ] - task_id = "meta-reference::app_eval-2" - await eval_tasks_impl.register_eval_task( - eval_task_id=task_id, + benchmark_id = 
"meta-reference::app_eval-2" + await benchmarks_impl.register_benchmark( + benchmark_id=benchmark_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.run_eval( - task_id=task_id, - task_config=AppEvalTaskConfig( + benchmark_id=benchmark_id, + task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -121,9 +121,9 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): ), ) assert response.job_id == "0" - job_status = await eval_impl.job_status(task_id, response.job_id) + job_status = await eval_impl.job_status(benchmark_id, response.job_id) assert job_status and job_status.value == "completed" - eval_response = await eval_impl.job_result(task_id, response.job_id) + eval_response = await eval_impl.job_result(benchmark_id, response.job_id) assert eval_response is not None assert len(eval_response.generations) == 5 @@ -131,9 +131,9 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): @pytest.mark.asyncio async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): - eval_impl, eval_tasks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasets], eval_stack[Api.models], ) @@ -159,20 +159,20 @@ async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): ) # register eval task - await eval_tasks_impl.register_eval_task( - eval_task_id="meta-reference-mmlu", + await benchmarks_impl.register_benchmark( + benchmark_id="meta-reference-mmlu", dataset_id="mmlu", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) # list benchmarks - response = await eval_tasks_impl.list_eval_tasks() + response = await benchmarks_impl.list_benchmarks() assert len(response) > 0 benchmark_id = "meta-reference-mmlu" response = await 
eval_impl.run_eval( - task_id=benchmark_id, - task_config=BenchmarkEvalTaskConfig( + benchmark_id=benchmark_id, + task_config=BenchmarkBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), diff --git a/llama_stack/providers/tests/resolver.py b/llama_stack/providers/tests/resolver.py index 0ff6327170..76343b7f48 100644 --- a/llama_stack/providers/tests/resolver.py +++ b/llama_stack/providers/tests/resolver.py @@ -10,8 +10,8 @@ from pydantic import BaseModel +from llama_stack.apis.benchmarks import BenchmarkInput from llama_stack.apis.datasets import DatasetInput -from llama_stack.apis.eval_tasks import EvalTaskInput from llama_stack.apis.models import ModelInput from llama_stack.apis.scoring_functions import ScoringFnInput from llama_stack.apis.shields import ShieldInput @@ -42,7 +42,7 @@ async def construct_stack_for_test( vector_dbs: Optional[List[VectorDBInput]] = None, datasets: Optional[List[DatasetInput]] = None, scoring_fns: Optional[List[ScoringFnInput]] = None, - eval_tasks: Optional[List[EvalTaskInput]] = None, + benchmarks: Optional[List[BenchmarkInput]] = None, tool_groups: Optional[List[ToolGroupInput]] = None, ) -> TestStack: sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db") @@ -56,7 +56,7 @@ async def construct_stack_for_test( vector_dbs=vector_dbs or [], datasets=datasets or [], scoring_fns=scoring_fns or [], - eval_tasks=eval_tasks or [], + benchmarks=benchmarks or [], tool_groups=tool_groups or [], ) run_config = parse_and_maybe_upgrade_config(run_config) diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index be6c9a928c..7d03b7c29a 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git 
a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 05d3f45259..6afff2be21 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index 04c5957d46..ddec3a7153 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -108,7 +108,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: brave-search diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 706444eb1b..9394c94efe 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -99,7 +99,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: brave-search diff --git a/llama_stack/templates/experimental-post-training/run.yaml b/llama_stack/templates/experimental-post-training/run.yaml index 75d103c9fa..e70ccdd2de 100644 --- a/llama_stack/templates/experimental-post-training/run.yaml +++ b/llama_stack/templates/experimental-post-training/run.yaml @@ -85,4 +85,4 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index 0fbe14a5a5..8f95e9d59b 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -164,7 +164,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] 
tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index ccf67dcbb0..64229a5d86 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -153,7 +153,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index f520a2fdab..867d7a0768 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -116,7 +116,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index 708cb1bcc6..d60acdefd9 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -106,7 +106,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 7f0abf5be0..e58ad15b34 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -116,7 +116,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index c0b7a4c605..5045e821af 100644 --- 
a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -106,7 +106,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index c5286fc6be..caac65c8c3 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 310585f23f..bade9a076f 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index d43cf3917e..f131e8ea65 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index c8ae362f54..14fb283544 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -139,7 +139,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] 
-eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index ac5dab7552..9d5bfc7a0e 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -113,7 +113,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 3a60fe61f1..9ac1f3267e 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -110,7 +110,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 1fe998a1f9..dd43f21f67 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 9d3db8a31e..24cd207c7f 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index 39b0f3c4e8..26815dcd06 100644 --- 
a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index ed6c9ef6f2..e1d85f59ac 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -106,7 +106,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index 8bf76f37b1..fc73e09789 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -105,7 +105,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index 2989266307..f101a5d600 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -159,7 +159,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 920003759c..8af85979d7 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -148,7 +148,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/vllm-gpu/run.yaml 
b/llama_stack/templates/vllm-gpu/run.yaml index 41a545e1ae..cdce5510d1 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search