diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 98270f7b86..b93f6a380a 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,256 +40,229 @@ } ], "paths": { - "/v1/datasetio/rows": { - "get": { + "/v1/eval/tasks/{task_id}/evaluations": { + "post": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/PaginatedRowsResult" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "DatasetIO" + "Eval" ], "description": "", "parameters": [ { - "name": "dataset_id", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "rows_in_page", - "in": "query", + "name": "task_id", + "in": "path", "required": true, - "schema": { - "type": "integer" - } - }, - { - "name": "page_token", - "in": "query", - "required": false, - "schema": { - "type": "string" - } - }, - { - "name": "filter_condition", - "in": "query", - "required": false, "schema": { "type": "string" } } - ] - }, - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "DatasetIO" ], - "description": "", - "parameters": [], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/AppendRowsRequest" + "$ref": "#/components/schemas/DeprecatedEvaluateRowsRequest" } } }, "required": true - } + }, + "deprecated": true } }, - "/v1/batch-inference/chat-completion": { - "post": { + "/v1/eval-tasks/{task_id}": { + "get": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/BatchChatCompletionResponse" + "oneOf": [ + { + "$ref": "#/components/schemas/Benchmark" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "BatchInference (Coming Soon)" + "Benchmarks" ], "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": 
"#/components/schemas/BatchChatCompletionRequest" - } + "parameters": [ + { + "name": "eval_task_id", + "in": "query", + "required": true, + "schema": { + "type": "string" } - }, - "required": true - } + } + ], + "deprecated": true } }, - "/v1/batch-inference/completion": { - "post": { + "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "get": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/BatchCompletionResponse" + "oneOf": [ + { + "$ref": "#/components/schemas/JobStatus" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "BatchInference (Coming Soon)" + "Eval" ], "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/BatchCompletionRequest" - } + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" } }, - "required": true - } - } - }, - "/v1/post-training/job/cancel": { - "post": { + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + }, + "delete": { "responses": { "200": { "description": "OK" } }, "tags": [ - "PostTraining (Coming Soon)" + "Eval" ], "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CancelTrainingJobRequest" - } + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" } }, - "required": true - } + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true } }, - "/v1/inference/chat-completion": { - "post": { + "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "get": { "responses": { "200": { - "description": "If stream=False, returns a ChatCompletionResponse with the full completion. 
If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk", + "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ChatCompletionResponse" - } - }, - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/ChatCompletionResponseStreamChunk" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "Inference" + "Eval" ], - "description": "Generate a chat completion for the given messages using the specified model.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ChatCompletionRequest" - } + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" } }, - "required": true - } + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true } }, - "/v1/inference/completion": { - "post": { + "/v1/eval-tasks": { + "get": { "responses": { "200": { - "description": "If stream=False, returns a CompletionResponse with the full completion. 
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk", + "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/CompletionResponse" - } - }, - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/CompletionResponseStreamChunk" + "$ref": "#/components/schemas/ListBenchmarksResponse" } } } } }, "tags": [ - "Inference" + "Benchmarks" ], - "description": "Generate a completion for the given content using the specified model.", + "description": "", "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CompletionRequest" - } - } - }, - "required": true - } - } - }, - "/v1/agents": { + "deprecated": true + }, "post": { "responses": { "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AgentCreateResponse" - } - } - } + "description": "OK" } }, "tags": [ - "Agents" + "Benchmarks" ], "description": "", "parameters": [], @@ -297,15 +270,16 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/CreateAgentRequest" + "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest" } } }, "required": true - } + }, + "deprecated": true } }, - "/v1/agents/{agent_id}/session": { + "/v1/eval/tasks/{task_id}/jobs": { "post": { "responses": { "200": { @@ -313,19 +287,19 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/AgentSessionCreateResponse" + "$ref": "#/components/schemas/Job" } } } } }, "tags": [ - "Agents" + "Eval" ], "description": "", "parameters": [ { - "name": "agent_id", + "name": "task_id", "in": "path", "required": true, "schema": { @@ -337,60 +311,84 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/CreateAgentSessionRequest" + "$ref": "#/components/schemas/DeprecatedRunEvalRequest" } } }, "required": true - } + }, + "deprecated": true } }, - 
"/v1/agents/{agent_id}/session/{session_id}/turn": { - "post": { + "/v1/datasetio/rows": { + "get": { "responses": { "200": { - "description": "A single turn in an interaction with an Agentic System. **OR** streamed agent turn completion response.", + "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Turn" - } - }, - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/AgentTurnResponseStreamChunk" + "$ref": "#/components/schemas/PaginatedRowsResult" } } } } }, "tags": [ - "Agents" + "DatasetIO" ], "description": "", "parameters": [ { - "name": "agent_id", - "in": "path", + "name": "dataset_id", + "in": "query", "required": true, "schema": { "type": "string" } }, { - "name": "session_id", - "in": "path", + "name": "rows_in_page", + "in": "query", "required": true, + "schema": { + "type": "integer" + } + }, + { + "name": "page_token", + "in": "query", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "filter_condition", + "in": "query", + "required": false, "schema": { "type": "string" } } + ] + }, + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "DatasetIO" ], + "description": "", + "parameters": [], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/CreateAgentTurnRequest" + "$ref": "#/components/schemas/AppendRowsRequest" } } }, @@ -398,116 +396,106 @@ } } }, - "/v1/agents/{agent_id}": { - "delete": { + "/v1/batch-inference/chat-completion": { + "post": { "responses": { "200": { - "description": "OK" + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BatchChatCompletionResponse" + } + } + } } }, "tags": [ - "Agents" + "BatchInference (Coming Soon)" ], "description": "", - "parameters": [ - { - "name": "agent_id", - "in": "path", - "required": true, - "schema": { - "type": "string" + "parameters": [], + "requestBody": { + "content": { + 
"application/json": { + "schema": { + "$ref": "#/components/schemas/BatchChatCompletionRequest" + } } - } - ] + }, + "required": true + } } }, - "/v1/agents/{agent_id}/session/{session_id}": { - "get": { + "/v1/batch-inference/completion": { + "post": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Session" + "$ref": "#/components/schemas/BatchCompletionResponse" } } } } }, "tags": [ - "Agents" + "BatchInference (Coming Soon)" ], "description": "", - "parameters": [ - { - "name": "session_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "agent_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "turn_ids", - "in": "query", - "required": false, - "schema": { - "type": "array", - "items": { - "type": "string" + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BatchCompletionRequest" } } - } - ] - }, - "delete": { + }, + "required": true + } + } + }, + "/v1/post-training/job/cancel": { + "post": { "responses": { "200": { "description": "OK" } }, "tags": [ - "Agents" + "PostTraining (Coming Soon)" ], "description": "", - "parameters": [ - { - "name": "session_id", - "in": "path", - "required": true, - "schema": { - "type": "string" + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CancelTrainingJobRequest" + } } }, - { - "name": "agent_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] + "required": true + } } }, - "/v1/inference/embeddings": { + "/v1/inference/chat-completion": { "post": { "responses": { "200": { - "description": "An array of embeddings, one for each content. Each embedding is a list of floats. 
The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}", + "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EmbeddingsResponse" + "$ref": "#/components/schemas/ChatCompletionResponse" + } + }, + "text/event-stream": { + "schema": { + "$ref": "#/components/schemas/ChatCompletionResponseStreamChunk" } } } @@ -516,13 +504,13 @@ "tags": [ "Inference" ], - "description": "Generate embeddings for content pieces using the specified model.", + "description": "Generate a chat completion for the given messages using the specified model.", "parameters": [], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EmbeddingsRequest" + "$ref": "#/components/schemas/ChatCompletionRequest" } } }, @@ -530,39 +518,35 @@ } } }, - "/v1/eval/tasks/{task_id}/evaluations": { + "/v1/inference/completion": { "post": { "responses": { "200": { - "description": "OK", + "description": "If stream=False, returns a CompletionResponse with the full completion. 
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateResponse" + "$ref": "#/components/schemas/CompletionResponse" + } + }, + "text/event-stream": { + "schema": { + "$ref": "#/components/schemas/CompletionResponseStreamChunk" } } } } }, "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } + "Inference" ], + "description": "Generate a completion for the given content using the specified model.", + "parameters": [], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateRowsRequest" + "$ref": "#/components/schemas/CompletionRequest" } } }, @@ -570,15 +554,15 @@ } } }, - "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": { - "get": { + "/v1/agents": { + "post": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/AgentStepResponse" + "$ref": "#/components/schemas/AgentCreateResponse" } } } @@ -588,51 +572,28 @@ "Agents" ], "description": "", - "parameters": [ - { - "name": "agent_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "session_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "turn_id", - "in": "path", - "required": true, - "schema": { - "type": "string" + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateAgentRequest" + } } }, - { - "name": "step_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] + "required": true + } } }, - "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}": { - "get": { + "/v1/agents/{agent_id}/session": { + "post": { "responses": { "200": { "description": 
"OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Turn" + "$ref": "#/components/schemas/AgentSessionCreateResponse" } } } @@ -650,112 +611,87 @@ "schema": { "type": "string" } - }, - { - "name": "session_id", - "in": "path", - "required": true, - "schema": { - "type": "string" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateAgentSessionRequest" + } } }, - { - "name": "turn_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] + "required": true + } } }, - "/v1/datasets/{dataset_id}": { - "get": { + "/v1/agents/{agent_id}/session/{session_id}/turn": { + "post": { "responses": { "200": { - "description": "OK", + "description": "A single turn in an interaction with an Agentic System. **OR** streamed agent turn completion response.", "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Dataset" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/Turn" + } + }, + "text/event-stream": { + "schema": { + "$ref": "#/components/schemas/AgentTurnResponseStreamChunk" } } } } }, "tags": [ - "Datasets" + "Agents" ], "description": "", "parameters": [ { - "name": "dataset_id", + "name": "agent_id", "in": "path", "required": true, "schema": { "type": "string" } - } - ] - }, - "delete": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Datasets" - ], - "description": "", - "parameters": [ + }, { - "name": "dataset_id", + "name": "session_id", "in": "path", "required": true, "schema": { "type": "string" } } - ] + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateAgentTurnRequest" + } + } + }, + "required": true + } } }, - "/v1/eval-tasks/{eval_task_id}": { - "get": { + "/v1/agents/{agent_id}": { + "delete": { "responses": { "200": { - "description": "OK", - "content": { - "application/json": { - 
"schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/EvalTask" - }, - { - "type": "null" - } - ] - } - } - } + "description": "OK" } }, "tags": [ - "EvalTasks" + "Agents" ], "description": "", "parameters": [ { - "name": "eval_task_id", + "name": "agent_id", "in": "path", "required": true, "schema": { @@ -765,7 +701,7 @@ ] } }, - "/v1/models/{model_id}": { + "/v1/agents/{agent_id}/session/{session_id}": { "get": { "responses": { "200": { @@ -773,31 +709,43 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Model" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/Session" } } } } }, "tags": [ - "Models" + "Agents" ], "description": "", "parameters": [ { - "name": "model_id", + "name": "session_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "agent_id", "in": "path", "required": true, "schema": { "type": "string" } + }, + { + "name": "turn_ids", + "in": "query", + "required": false, + "schema": { + "type": "array", + "items": { + "type": "string" + } + } } ] }, @@ -808,12 +756,20 @@ } }, "tags": [ - "Models" + "Agents" ], "description": "", "parameters": [ { - "name": "model_id", + "name": "session_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "agent_id", "in": "path", "required": true, "schema": { @@ -823,81 +779,78 @@ ] } }, - "/v1/scoring-functions/{scoring_fn_id}": { - "get": { + "/v1/inference/embeddings": { + "post": { "responses": { "200": { - "description": "OK", + "description": "An array of embeddings, one for each content. Each embedding is a list of floats. 
The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}", "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFn" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/EmbeddingsResponse" } } } } }, "tags": [ - "ScoringFunctions" + "Inference" ], - "description": "", - "parameters": [ - { - "name": "scoring_fn_id", - "in": "path", - "required": true, - "schema": { - "type": "string" + "description": "Generate embeddings for content pieces using the specified model.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EmbeddingsRequest" + } } - } - ] + }, + "required": true + } } }, - "/v1/shields/{identifier}": { - "get": { + "/v1/eval/benchmarks/{benchmark_id}/evaluations": { + "post": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Shield" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "Shields" + "Eval" ], "description": "", "parameters": [ { - "name": "identifier", + "name": "benchmark_id", "in": "path", "required": true, "schema": { "type": "string" } } - ] + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateRowsRequest" + } + } + }, + "required": true + } } }, - "/v1/telemetry/traces/{trace_id}/spans/{span_id}": { + "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": { "get": { "responses": { "200": { @@ -905,19 +858,19 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Span" + "$ref": "#/components/schemas/AgentStepResponse" } } } } }, "tags": [ - "Telemetry" + "Agents" ], "description": "", "parameters": [ { - "name": "trace_id", + "name": "agent_id", "in": "path", "required": true, "schema": { @@ 
-925,7 +878,23 @@ } }, { - "name": "span_id", + "name": "session_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "turn_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "step_id", "in": "path", "required": true, "schema": { @@ -935,7 +904,7 @@ ] } }, - "/v1/telemetry/spans/{span_id}/tree": { + "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}": { "get": { "responses": { "200": { @@ -943,19 +912,19 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/QuerySpanTreeResponse" + "$ref": "#/components/schemas/Turn" } } } } }, "tags": [ - "Telemetry" + "Agents" ], "description": "", "parameters": [ { - "name": "span_id", + "name": "agent_id", "in": "path", "required": true, "schema": { @@ -963,28 +932,25 @@ } }, { - "name": "attributes_to_return", - "in": "query", - "required": false, + "name": "session_id", + "in": "path", + "required": true, "schema": { - "type": "array", - "items": { - "type": "string" - } + "type": "string" } }, { - "name": "max_depth", - "in": "query", - "required": false, + "name": "turn_id", + "in": "path", + "required": true, "schema": { - "type": "integer" + "type": "string" } } ] } }, - "/v1/tools/{tool_name}": { + "/v1/eval/benchmarks/{benchmark_id}": { "get": { "responses": { "200": { @@ -992,19 +958,26 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Tool" + "oneOf": [ + { + "$ref": "#/components/schemas/Benchmark" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "ToolGroups" + "Benchmarks" ], "description": "", "parameters": [ { - "name": "tool_name", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1014,7 +987,7 @@ ] } }, - "/v1/toolgroups/{toolgroup_id}": { + "/v1/datasets/{dataset_id}": { "get": { "responses": { "200": { @@ -1022,19 +995,26 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ToolGroup" + "oneOf": 
[ + { + "$ref": "#/components/schemas/Dataset" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "ToolGroups" + "Datasets" ], "description": "", "parameters": [ { - "name": "toolgroup_id", + "name": "dataset_id", "in": "path", "required": true, "schema": { @@ -1050,12 +1030,12 @@ } }, "tags": [ - "ToolGroups" + "Datasets" ], - "description": "Unregister a tool group", + "description": "", "parameters": [ { - "name": "toolgroup_id", + "name": "dataset_id", "in": "path", "required": true, "schema": { @@ -1065,7 +1045,7 @@ ] } }, - "/v1/telemetry/traces/{trace_id}": { + "/v1/models/{model_id}": { "get": { "responses": { "200": { @@ -1073,19 +1053,47 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Trace" + "oneOf": [ + { + "$ref": "#/components/schemas/Model" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "Telemetry" + "Models" ], "description": "", "parameters": [ { - "name": "trace_id", + "name": "model_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "delete": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Models" + ], + "description": "", + "parameters": [ + { + "name": "model_id", "in": "path", "required": true, "schema": { @@ -1095,7 +1103,7 @@ ] } }, - "/v1/post-training/job/artifacts": { + "/v1/scoring-functions/{scoring_fn_id}": { "get": { "responses": { "200": { @@ -1105,7 +1113,7 @@ "schema": { "oneOf": [ { - "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" + "$ref": "#/components/schemas/ScoringFn" }, { "type": "null" @@ -1117,13 +1125,13 @@ } }, "tags": [ - "PostTraining (Coming Soon)" + "ScoringFunctions" ], "description": "", "parameters": [ { - "name": "job_uuid", - "in": "query", + "name": "scoring_fn_id", + "in": "path", "required": true, "schema": { "type": "string" @@ -1132,7 +1140,7 @@ ] } }, - "/v1/post-training/job/status": { + "/v1/shields/{identifier}": { "get": { "responses": { "200": { @@ -1142,7 +1150,7 
@@ "schema": { "oneOf": [ { - "$ref": "#/components/schemas/PostTrainingJobStatusResponse" + "$ref": "#/components/schemas/Shield" }, { "type": "null" @@ -1154,13 +1162,13 @@ } }, "tags": [ - "PostTraining (Coming Soon)" + "Shields" ], "description": "", "parameters": [ { - "name": "job_uuid", - "in": "query", + "name": "identifier", + "in": "path", "required": true, "schema": { "type": "string" @@ -1169,7 +1177,7 @@ ] } }, - "/v1/post-training/jobs": { + "/v1/telemetry/traces/{trace_id}/spans/{span_id}": { "get": { "responses": { "200": { @@ -1177,20 +1185,37 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListPostTrainingJobsResponse" + "$ref": "#/components/schemas/Span" } } } } }, "tags": [ - "PostTraining (Coming Soon)" + "Telemetry" ], "description": "", - "parameters": [] + "parameters": [ + { + "name": "trace_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "span_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] } }, - "/v1/vector-dbs/{vector_db_id}": { + "/v1/telemetry/spans/{span_id}/tree": { "get": { "responses": { "200": { @@ -1198,47 +1223,68 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/VectorDB" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/QuerySpanTreeResponse" } } } } }, "tags": [ - "VectorDBs" + "Telemetry" ], "description": "", "parameters": [ { - "name": "vector_db_id", + "name": "span_id", "in": "path", "required": true, "schema": { "type": "string" } + }, + { + "name": "attributes_to_return", + "in": "query", + "required": false, + "schema": { + "type": "array", + "items": { + "type": "string" + } + } + }, + { + "name": "max_depth", + "in": "query", + "required": false, + "schema": { + "type": "integer" + } } ] - }, - "delete": { + } + }, + "/v1/tools/{tool_name}": { + "get": { "responses": { "200": { - "description": "OK" + "description": "OK", + 
"content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Tool" + } + } + } } }, "tags": [ - "VectorDBs" + "ToolGroups" ], "description": "", "parameters": [ { - "name": "vector_db_id", + "name": "tool_name", "in": "path", "required": true, "schema": { @@ -1248,7 +1294,7 @@ ] } }, - "/v1/health": { + "/v1/toolgroups/{toolgroup_id}": { "get": { "responses": { "200": { @@ -1256,69 +1302,303 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/HealthInfo" + "$ref": "#/components/schemas/ToolGroup" } } } } }, "tags": [ - "Inspect" + "ToolGroups" ], "description": "", - "parameters": [] - } - }, - "/v1/tool-runtime/rag-tool/insert": { - "post": { + "parameters": [ + { + "name": "toolgroup_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "delete": { "responses": { "200": { "description": "OK" } }, "tags": [ - "ToolRuntime" + "ToolGroups" ], - "description": "Index documents so they can be used by the RAG system", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InsertRequest" - } + "description": "Unregister a tool group", + "parameters": [ + { + "name": "toolgroup_id", + "in": "path", + "required": true, + "schema": { + "type": "string" } - }, - "required": true - } + } + ] } }, - "/v1/vector-io/insert": { - "post": { + "/v1/telemetry/traces/{trace_id}": { + "get": { "responses": { "200": { - "description": "OK" + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Trace" + } + } + } } }, "tags": [ - "VectorIO" + "Telemetry" ], "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InsertChunksRequest" - } + "parameters": [ + { + "name": "trace_id", + "in": "path", + "required": true, + "schema": { + "type": "string" } - }, - "required": true - } + } + ] } }, - 
"/v1/tool-runtime/invoke": { - "post": { + "/v1/post-training/job/artifacts": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "PostTraining (Coming Soon)" + ], + "description": "", + "parameters": [ + { + "name": "job_uuid", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/post-training/job/status": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/PostTrainingJobStatusResponse" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "PostTraining (Coming Soon)" + ], + "description": "", + "parameters": [ + { + "name": "job_uuid", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/post-training/jobs": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListPostTrainingJobsResponse" + } + } + } + } + }, + "tags": [ + "PostTraining (Coming Soon)" + ], + "description": "", + "parameters": [] + } + }, + "/v1/vector-dbs/{vector_db_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/VectorDB" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "VectorDBs" + ], + "description": "", + "parameters": [ + { + "name": "vector_db_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "delete": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "VectorDBs" + ], + "description": "", + "parameters": [ + { + "name": "vector_db_id", + "in": "path", + 
"required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/health": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HealthInfo" + } + } + } + } + }, + "tags": [ + "Inspect" + ], + "description": "", + "parameters": [] + } + }, + "/v1/tool-runtime/rag-tool/insert": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "ToolRuntime" + ], + "description": "Index documents so they can be used by the RAG system", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InsertRequest" + } + } + }, + "required": true + } + } + }, + "/v1/vector-io/insert": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "VectorIO" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InsertChunksRequest" + } + } + }, + "required": true + } + } + }, + "/v1/tool-runtime/invoke": { + "post": { "responses": { "200": { "description": "OK", @@ -1348,7 +1628,7 @@ } } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1375,7 +1655,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1404,7 +1684,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1422,7 +1702,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1442,7 +1722,7 @@ "description": "", "parameters": [ { - "name": "job_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1450,7 +1730,7 @@ } }, { - 
"name": "task_id", + "name": "job_id", "in": "path", "required": true, "schema": { @@ -1460,7 +1740,7 @@ ] } }, - "/v1/datasets": { + "/v1/eval/benchmarks": { "get": { "responses": { "200": { @@ -1468,14 +1748,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListDatasetsResponse" + "$ref": "#/components/schemas/ListBenchmarksResponse" } } } } }, "tags": [ - "Datasets" + "Benchmarks" ], "description": "", "parameters": [] @@ -1487,7 +1767,7 @@ } }, "tags": [ - "Datasets" + "Benchmarks" ], "description": "", "parameters": [], @@ -1495,7 +1775,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RegisterDatasetRequest" + "$ref": "#/components/schemas/RegisterBenchmarkRequest" } } }, @@ -1503,7 +1783,7 @@ } } }, - "/v1/eval-tasks": { + "/v1/datasets": { "get": { "responses": { "200": { @@ -1511,14 +1791,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListEvalTasksResponse" + "$ref": "#/components/schemas/ListDatasetsResponse" } } } } }, "tags": [ - "EvalTasks" + "Datasets" ], "description": "", "parameters": [] @@ -1530,7 +1810,7 @@ } }, "tags": [ - "EvalTasks" + "Datasets" ], "description": "", "parameters": [], @@ -1538,7 +1818,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RegisterEvalTaskRequest" + "$ref": "#/components/schemas/RegisterDatasetRequest" } } }, @@ -2121,7 +2401,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs": { + "/v1/eval/benchmarks/{benchmark_id}/jobs": { "post": { "responses": { "200": { @@ -2141,7 +2421,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -2365,84 +2645,216 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { - "AppendRowsRequest": { + "AgentCandidate": { "type": "object", "properties": { - "dataset_id": { - "type": "string" + "type": { + "type": "string", + 
"const": "agent", + "default": "agent" }, - "rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } + "config": { + "$ref": "#/components/schemas/AgentConfig" } }, "additionalProperties": false, "required": [ - "dataset_id", - "rows" + "type", + "config" ] }, - "CompletionMessage": { + "AgentConfig": { "type": "object", "properties": { - "role": { - "type": "string", - "const": "assistant", - "default": "assistant", - "description": "Must be \"assistant\" to identify this as the model's response" + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the model's response" + "input_shields": { + "type": "array", + "items": { + "type": "string" + } }, - "stop_reason": { + "output_shields": { + "type": "array", + "items": { + "type": "string" + } + }, + "toolgroups": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AgentTool" + } + }, + "client_tools": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolDef" + } + }, + "tool_choice": { "type": "string", "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" + "auto", + "required" ], - "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." + "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. 
It depends on the Instruction Following capabilities of the model." }, - "tool_calls": { + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "Prompt format for calling custom / zero shot tools." + }, + "tool_config": { + "$ref": "#/components/schemas/ToolConfig" + }, + "max_infer_iters": { + "type": "integer", + "default": 10 + }, + "model": { + "type": "string" + }, + "instructions": { + "type": "string" + }, + "enable_session_persistence": { + "type": "boolean" + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat" + } + }, + "additionalProperties": false, + "required": [ + "model", + "instructions", + "enable_session_persistence" + ] + }, + "AgentTool": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "args": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "name", + "args" + ] + } + ] + }, + "AggregationFunctionType": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ] + }, + "BasicScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "basic", + "default": "basic" + }, + "aggregation_functions": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolCall" - }, - "description": "List of tool calls. Each tool call is a ToolCall object." 
+ "$ref": "#/components/schemas/AggregationFunctionType" + } } }, "additionalProperties": false, "required": [ - "role", - "content", - "stop_reason" + "type" + ] + }, + "BenchmarkConfig": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "benchmark", + "default": "benchmark" + }, + "eval_candidate": { + "$ref": "#/components/schemas/EvalCandidate" + }, + "scoring_params": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringFnParams" + } + }, + "num_examples": { + "type": "integer" + } + }, + "additionalProperties": false, + "required": [ + "type", + "eval_candidate", + "scoring_params" + ] + }, + "EvalCandidate": { + "oneOf": [ + { + "$ref": "#/components/schemas/ModelCandidate" + }, + { + "$ref": "#/components/schemas/AgentCandidate" + } ], - "description": "A message containing the model's (assistant) response in a chat conversation." + "discriminator": { + "propertyName": "type", + "mapping": { + "model": "#/components/schemas/ModelCandidate", + "agent": "#/components/schemas/AgentCandidate" + } + } }, "GrammarResponseFormat": { "type": "object", @@ -2610,30 +3022,89 @@ ], "description": "Configuration for JSON schema-guided response generation." 
}, - "Message": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" + "LLMAsJudgeScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm_as_judge", + "default": "llm_as_judge" }, - { - "$ref": "#/components/schemas/SystemMessage" + "judge_model": { + "type": "string" }, - { - "$ref": "#/components/schemas/ToolResponseMessage" + "prompt_template": { + "type": "string" }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ], - "discriminator": { - "propertyName": "role", - "mapping": { - "user": "#/components/schemas/UserMessage", - "system": "#/components/schemas/SystemMessage", - "tool": "#/components/schemas/ToolResponseMessage", - "assistant": "#/components/schemas/CompletionMessage" - } - } + "judge_score_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "judge_model" + ] + }, + "ModelCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "model", + "default": "model" + }, + "model": { + "type": "string" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "system_message": { + "$ref": "#/components/schemas/SystemMessage" + } + }, + "additionalProperties": false, + "required": [ + "type", + "model", + "sampling_params" + ] + }, + "RegexParserScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "regex_parser", + "default": "regex_parser" + }, + "parsing_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } + } + }, + "additionalProperties": false, + "required": [ + "type" + ] }, "ResponseFormat": { "oneOf": [ @@ -2693,6 +3164,27 
@@ } } }, + "ScoringFnParams": { + "oneOf": [ + { + "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" + }, + { + "$ref": "#/components/schemas/RegexParserScoringFnParams" + }, + { + "$ref": "#/components/schemas/BasicScoringFnParams" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", + "regex_parser": "#/components/schemas/RegexParserScoringFnParams", + "basic": "#/components/schemas/BasicScoringFnParams" + } + } + }, "SystemMessage": { "type": "object", "properties": { @@ -2735,90 +3227,79 @@ ], "description": "A text content item" }, - "ToolCall": { + "ToolConfig": { "type": "object", "properties": { - "call_id": { + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.", + "default": "auto" + }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." + }, + "system_message_behavior": { + "type": "string", + "enum": [ + "append", + "replace" + ], + "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. 
The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", + "default": "append" + } + }, + "additionalProperties": false, + "required": [ + "system_message_behavior" + ], + "description": "Configuration for tool use." + }, + "ToolDef": { + "type": "object", + "properties": { + "name": { "type": "string" }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] + "description": { + "type": "string" }, - "arguments": { + "parameters": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolParameter" + } + }, + "metadata": { "type": "object", "additionalProperties": { "oneOf": [ { - "type": "string" + "type": "null" }, { - "type": "integer" + "type": "boolean" }, { "type": "number" }, { - "type": "boolean" - }, - { - "type": "null" + "type": "string" }, { - "type": "array", - "items": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } + "type": "array" }, { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } + "type": "object" } ] } @@ -2826,49 +3307,16 @@ }, "additionalProperties": false, "required": [ - "call_id", - "tool_name", - "arguments" + "name" ] }, - "ToolDefinition": { + "ToolParameter": { "type": "object", "properties": { - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] - }, - "description": { + "name": { "type": "string" }, - "parameters": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ToolParamDefinition" - } - } - }, - 
"additionalProperties": false, - "required": [ - "tool_name" - ] - }, - "ToolParamDefinition": { - "type": "object", - "properties": { - "param_type": { + "parameter_type": { "type": "string" }, "description": { @@ -2903,54 +3351,13 @@ }, "additionalProperties": false, "required": [ - "param_type" + "name", + "parameter_type", + "description", + "required" ] }, - "ToolResponseMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "tool", - "default": "tool", - "description": "Must be \"tool\" to identify this as a tool response" - }, - "call_id": { - "type": "string", - "description": "Unique identifier for the tool call this response is for" - }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ], - "description": "Name of the tool that was called" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The response content from the tool" - } - }, - "additionalProperties": false, - "required": [ - "role", - "call_id", - "tool_name", - "content" - ], - "description": "A message representing the result of a tool invocation." - }, - "TopKSamplingStrategy": { + "TopKSamplingStrategy": { "type": "object", "properties": { "type": { @@ -3001,277 +3408,379 @@ "uri" ] }, - "UserMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "user", - "default": "user", - "description": "Must be \"user\" to identify this as a user message" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the message, which can include text and other media" - }, - "context": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future." 
- } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "description": "A message from the user in a chat conversation." - }, - "BatchChatCompletionRequest": { + "DeprecatedEvaluateRowsRequest": { "type": "object", "properties": { - "model": { - "type": "string" - }, - "messages_batch": { + "input_rows": { "type": "array", "items": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Message" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] } } }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "tools": { + "scoring_functions": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDefinition" + "type": "string" } }, - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "Prompt format for calling custom / zero shot tools." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat" - }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." 
- } - }, - "additionalProperties": false + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, "required": [ - "model", - "messages_batch" + "input_rows", + "scoring_functions", + "task_config" ] }, - "BatchChatCompletionResponse": { + "EvaluateResponse": { "type": "object", "properties": { - "batch": { + "generations": { "type": "array", "items": { - "$ref": "#/components/schemas/ChatCompletionResponse" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "scores": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" } } }, "additionalProperties": false, "required": [ - "batch" + "generations", + "scores" ] }, - "ChatCompletionResponse": { + "ScoringResult": { "type": "object", "properties": { - "metrics": { + "score_rows": { "type": "array", "items": { - "$ref": "#/components/schemas/MetricEvent" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, - "completion_message": { - "$ref": "#/components/schemas/CompletionMessage", - "description": "The complete response message" - }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" - } - }, - "additionalProperties": false, - "required": [ - "completion_message" - ], - "description": "Response from a chat completion request." 
- }, - "MetricEvent": { - "type": "object", - "properties": { - "trace_id": { - "type": "string" - }, - "span_id": { - "type": "string" - }, - "timestamp": { - "type": "string", - "format": "date-time" - }, - "attributes": { + "aggregated_results": { "type": "object", "additionalProperties": { "oneOf": [ { - "type": "string" + "type": "null" }, { - "type": "integer" + "type": "boolean" }, { "type": "number" }, { - "type": "boolean" + "type": "string" }, { - "type": "null" + "type": "array" + }, + { + "type": "object" } ] } + } + }, + "additionalProperties": false, + "required": [ + "score_rows", + "aggregated_results" + ] + }, + "Benchmark": { + "type": "object", + "properties": { + "identifier": { + "type": "string" }, - "type": { - "type": "string", - "const": "metric", - "default": "metric" - }, - "metric": { + "provider_resource_id": { "type": "string" }, - "value": { - "oneOf": [ - { - "type": "integer" - }, - { - "type": "number" - } - ] + "provider_id": { + "type": "string" }, - "unit": { + "type": { + "type": "string", + "const": "benchmark", + "default": "benchmark" + }, + "dataset_id": { "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, "required": [ - "trace_id", - "span_id", - "timestamp", + "identifier", + "provider_resource_id", + "provider_id", "type", - "metric", - "value", - "unit" + "dataset_id", + "scoring_functions", + "metadata" ] }, - "TokenLogProbs": { + "JobStatus": { + "type": "string", + "enum": [ + "completed", + "in_progress", + "failed", + "scheduled" + ] + }, + "ListBenchmarksResponse": { "type": "object", "properties": { - "logprobs_by_token": { - "type": "object", - "additionalProperties": { - 
"type": "number" - }, - "description": "Dictionary mapping tokens to their log probabilities" + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Benchmark" + } } }, "additionalProperties": false, "required": [ - "logprobs_by_token" - ], - "description": "Log probabilities for generated tokens." + "data" + ] }, - "BatchCompletionRequest": { + "DeprecatedRegisterEvalTaskRequest": { "type": "object", "properties": { - "model": { + "eval_task_id": { "type": "string" }, - "content_batch": { + "dataset_id": { + "type": "string" + }, + "scoring_functions": { "type": "array", "items": { - "$ref": "#/components/schemas/InterleavedContent" + "type": "string" } }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "provider_benchmark_id": { + "type": "string" }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "provider_id": { + "type": "string" }, - "logprobs": { + "metadata": { "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." 
- } - }, - "additionalProperties": false + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, "required": [ - "model", - "content_batch" + "eval_task_id", + "dataset_id", + "scoring_functions" ] }, - "BatchCompletionResponse": { + "DeprecatedRunEvalRequest": { "type": "object", "properties": { - "batch": { + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "task_config" + ] + }, + "Job": { + "type": "object", + "properties": { + "job_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "job_id" + ] + }, + "AppendRowsRequest": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "rows": { "type": "array", "items": { - "$ref": "#/components/schemas/CompletionResponse" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } } }, "additionalProperties": false, "required": [ - "batch" + "dataset_id", + "rows" ] }, - "CompletionResponse": { + "CompletionMessage": { "type": "object", "properties": { - "content": { + "role": { "type": "string", - "description": "The generated completion text" + "const": "assistant", + "default": "assistant", + "description": "Must be \"assistant\" to identify this as the model's response" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the model's response" }, "stop_reason": { "type": "string", @@ -3280,398 +3789,311 @@ "end_of_message", "out_of_tokens" ], - "description": "Reason why generation stopped" + "description": "Reason why the model stopped generating. 
Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." }, - "logprobs": { + "tool_calls": { "type": "array", "items": { - "$ref": "#/components/schemas/TokenLogProbs" + "$ref": "#/components/schemas/ToolCall" }, - "description": "Optional log probabilities for generated tokens" + "description": "List of tool calls. Each tool call is a ToolCall object." } }, "additionalProperties": false, "required": [ + "role", "content", "stop_reason" ], - "description": "Response from a completion request." + "description": "A message containing the model's (assistant) response in a chat conversation." }, - "CancelTrainingJobRequest": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" + "Message": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, - "ToolConfig": { - "type": "object", - "properties": { - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.", - "default": "auto" - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. 
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." - }, - "system_message_behavior": { - "type": "string", - "enum": [ - "append", - "replace" - ], - "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", - "default": "append" - } - }, - "additionalProperties": false, - "required": [ - "system_message_behavior" ], - "description": "Configuration for tool use." + "discriminator": { + "propertyName": "role", + "mapping": { + "user": "#/components/schemas/UserMessage", + "system": "#/components/schemas/SystemMessage", + "tool": "#/components/schemas/ToolResponseMessage", + "assistant": "#/components/schemas/CompletionMessage" + } + } }, - "ChatCompletionRequest": { + "ToolCall": { "type": "object", "properties": { - "model_id": { - "type": "string", - "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." 
- }, - "messages": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Message" - }, - "description": "List of messages in the conversation" - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "Parameters to control the sampling strategy" - }, - "tools": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolDefinition" - }, - "description": "(Optional) List of tool definitions available to the model" - }, - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead." - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it." - }, - "stream": { - "type": "boolean", - "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." 
+ "call_id": { + "type": "string" }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." + ] }, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig", - "description": "(Optional) Configuration for tool use." + "arguments": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + }, + { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + } + ] + } } }, "additionalProperties": false, "required": [ - "model_id", - "messages" + "call_id", + "tool_name", + "arguments" ] }, - "ChatCompletionResponseEvent": { + "ToolDefinition": { "type": "object", "properties": { - "event_type": { - "type": "string", - "enum": [ - "start", - "complete", - "progress" - ], - "description": "Type of the event" - }, - "delta": { - "$ref": "#/components/schemas/ContentDelta", - "description": "Content generated since last event. This can be one or more tokens, or a tool call." 
+ "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" + } + ] }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" + "description": { + "type": "string" }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Optional reason why generation stopped, if complete" - } - }, - "additionalProperties": false, - "required": [ - "event_type", - "delta" - ], - "description": "An event during chat completion generation." - }, - "ChatCompletionResponseStreamChunk": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricEvent" + "parameters": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ToolParamDefinition" } - }, - "event": { - "$ref": "#/components/schemas/ChatCompletionResponseEvent", - "description": "The event containing the new content" } }, "additionalProperties": false, "required": [ - "event" - ], - "description": "A chunk of a streamed chat completion response." 
- }, - "ContentDelta": { - "oneOf": [ - { - "$ref": "#/components/schemas/TextDelta" - }, - { - "$ref": "#/components/schemas/ImageDelta" - }, - { - "$ref": "#/components/schemas/ToolCallDelta" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "text": "#/components/schemas/TextDelta", - "image": "#/components/schemas/ImageDelta", - "tool_call": "#/components/schemas/ToolCallDelta" - } - } - }, - "ImageDelta": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "image", - "default": "image" - }, - "image": { - "type": "string", - "contentEncoding": "base64" - } - }, - "additionalProperties": false, - "required": [ - "type", - "image" + "tool_name" ] }, - "TextDelta": { + "ToolParamDefinition": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "text", - "default": "text" + "param_type": { + "type": "string" }, - "text": { + "description": { "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "type", - "text" - ] - }, - "ToolCallDelta": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "tool_call", - "default": "tool_call" }, - "tool_call": { + "required": { + "type": "boolean", + "default": true + }, + "default": { "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, { "type": "string" }, { - "$ref": "#/components/schemas/ToolCall" + "type": "array" + }, + { + "type": "object" } ] - }, - "parse_status": { - "type": "string", - "enum": [ - "started", - "in_progress", - "failed", - "succeeded" - ] } }, "additionalProperties": false, "required": [ - "type", - "tool_call", - "parse_status" + "param_type" ] }, - "CompletionRequest": { + "ToolResponseMessage": { "type": "object", "properties": { - "model_id": { + "role": { "type": "string", - "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." 
- }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content to generate a completion for" - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "(Optional) Parameters to control the sampling strategy" - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding" + "const": "tool", + "default": "tool", + "description": "Must be \"tool\" to identify this as a tool response" }, - "stream": { - "type": "boolean", - "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." + "call_id": { + "type": "string", + "description": "Unique identifier for the tool call this response is for" }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." + ], + "description": "Name of the tool that was called" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The response content from the tool" } }, "additionalProperties": false, "required": [ - "model_id", + "role", + "call_id", + "tool_name", "content" - ] + ], + "description": "A message representing the result of a tool invocation." }, - "CompletionResponseStreamChunk": { + "UserMessage": { "type": "object", "properties": { - "delta": { + "role": { "type": "string", - "description": "New content generated since last chunk. This can be one or more tokens." 
+ "const": "user", + "default": "user", + "description": "Must be \"user\" to identify this as a user message" }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Optional reason why generation stopped, if complete" + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the message, which can include text and other media" }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" + "context": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future." } }, "additionalProperties": false, "required": [ - "delta" + "role", + "content" ], - "description": "A chunk of a streamed completion response." + "description": "A message from the user in a chat conversation." }, - "AgentConfig": { + "BatchChatCompletionRequest": { "type": "object", "properties": { - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "input_shields": { - "type": "array", - "items": { - "type": "string" - } + "model": { + "type": "string" }, - "output_shields": { + "messages_batch": { "type": "array", "items": { - "type": "string" + "type": "array", + "items": { + "$ref": "#/components/schemas/Message" + } } }, - "toolgroups": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AgentTool" - } + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" }, - "client_tools": { + "tools": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDef" + "$ref": "#/components/schemas/ToolDefinition" } }, "tool_choice": { @@ -3691,565 +4113,614 @@ ], "description": "Prompt format for calling custom / zero shot tools." 
}, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig" - }, - "max_infer_iters": { - "type": "integer", - "default": 10 - }, - "model": { - "type": "string" - }, - "instructions": { - "type": "string" - }, - "enable_session_persistence": { - "type": "boolean" - }, "response_format": { "$ref": "#/components/schemas/ResponseFormat" + }, + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false } }, "additionalProperties": false, "required": [ "model", - "instructions", - "enable_session_persistence" + "messages_batch" ] }, - "AgentTool": { - "oneOf": [ - { - "type": "string" + "BatchChatCompletionResponse": { + "type": "object", + "properties": { + "batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ChatCompletionResponse" + } + } + }, + "additionalProperties": false, + "required": [ + "batch" + ] + }, + "ChatCompletionResponse": { + "type": "object", + "properties": { + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "args": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } + "completion_message": { + "$ref": "#/components/schemas/CompletionMessage", + "description": "The complete response message" + }, + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" }, - "additionalProperties": false, - "required": [ - "name", - "args" - ] + "description": "Optional log probabilities for generated tokens" } - ] + }, + "additionalProperties": false, + "required": [ + "completion_message" + ], + "description": "Response 
from a chat completion request." }, - "ToolDef": { + "MetricEvent": { "type": "object", "properties": { - "name": { + "trace_id": { "type": "string" }, - "description": { + "span_id": { "type": "string" }, - "parameters": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolParameter" - } + "timestamp": { + "type": "string", + "format": "date-time" }, - "metadata": { + "attributes": { "type": "object", "additionalProperties": { "oneOf": [ { - "type": "null" + "type": "string" }, { - "type": "boolean" + "type": "integer" }, { "type": "number" }, { - "type": "string" - }, - { - "type": "array" + "type": "boolean" }, { - "type": "object" + "type": "null" } ] } - } - }, - "additionalProperties": false, - "required": [ - "name" - ] - }, - "ToolParameter": { - "type": "object", - "properties": { - "name": { - "type": "string" }, - "parameter_type": { - "type": "string" + "type": { + "type": "string", + "const": "metric", + "default": "metric" }, - "description": { + "metric": { "type": "string" }, - "required": { - "type": "boolean", - "default": true - }, - "default": { + "value": { "oneOf": [ { - "type": "null" - }, - { - "type": "boolean" + "type": "integer" }, { "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" } ] + }, + "unit": { + "type": "string" } }, "additionalProperties": false, "required": [ - "name", - "parameter_type", - "description", - "required" + "trace_id", + "span_id", + "timestamp", + "type", + "metric", + "value", + "unit" ] }, - "CreateAgentRequest": { + "TokenLogProbs": { "type": "object", "properties": { - "agent_config": { - "$ref": "#/components/schemas/AgentConfig" + "logprobs_by_token": { + "type": "object", + "additionalProperties": { + "type": "number" + }, + "description": "Dictionary mapping tokens to their log probabilities" } }, "additionalProperties": false, "required": [ - "agent_config" - ] + "logprobs_by_token" + ], + "description": "Log probabilities for generated 
tokens." }, - "AgentCreateResponse": { + "BatchCompletionRequest": { "type": "object", "properties": { - "agent_id": { + "model": { "type": "string" + }, + "content_batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContent" + } + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat" + }, + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false } }, "additionalProperties": false, "required": [ - "agent_id" + "model", + "content_batch" ] }, - "CreateAgentSessionRequest": { + "BatchCompletionResponse": { + "type": "object", + "properties": { + "batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/CompletionResponse" + } + } + }, + "additionalProperties": false, + "required": [ + "batch" + ] + }, + "CompletionResponse": { "type": "object", "properties": { - "session_name": { - "type": "string" + "content": { + "type": "string", + "description": "The generated completion text" + }, + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Reason why generation stopped" + }, + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" } }, "additionalProperties": false, "required": [ - "session_name" - ] + "content", + "stop_reason" + ], + "description": "Response from a completion request." 
}, - "AgentSessionCreateResponse": { + "CancelTrainingJobRequest": { "type": "object", "properties": { - "session_id": { + "job_uuid": { "type": "string" } }, "additionalProperties": false, "required": [ - "session_id" + "job_uuid" ] }, - "CreateAgentTurnRequest": { + "ChatCompletionRequest": { "type": "object", "properties": { + "model_id": { + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." + }, "messages": { "type": "array", "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - } - ] - } + "$ref": "#/components/schemas/Message" + }, + "description": "List of messages in the conversation" }, - "stream": { - "type": "boolean" + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "Parameters to control the sampling strategy" }, - "documents": { + "tools": { "type": "array", "items": { - "type": "object", - "properties": { - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/InterleavedContentItem" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContentItem" - } - }, - { - "$ref": "#/components/schemas/URL" - } - ] - }, - "mime_type": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "content", - "mime_type" - ] - } + "$ref": "#/components/schemas/ToolDefinition" + }, + "description": "(Optional) List of tool definitions available to the model" }, - "toolgroups": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AgentTool" - } + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead." 
+ }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead." + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it." + }, + "stream": { + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." + }, + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." }, "tool_config": { - "$ref": "#/components/schemas/ToolConfig" + "$ref": "#/components/schemas/ToolConfig", + "description": "(Optional) Configuration for tool use." 
} }, "additionalProperties": false, "required": [ + "model_id", "messages" ] }, - "InferenceStep": { + "ChatCompletionResponseEvent": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, - "step_id": { - "type": "string" - }, - "started_at": { + "event_type": { "type": "string", - "format": "date-time" + "enum": [ + "start", + "complete", + "progress" + ], + "description": "Type of the event" }, - "completed_at": { - "type": "string", - "format": "date-time" + "delta": { + "$ref": "#/components/schemas/ContentDelta", + "description": "Content generated since last event. This can be one or more tokens, or a tool call." }, - "step_type": { - "type": "string", - "const": "inference", - "default": "inference" + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" }, - "model_response": { - "$ref": "#/components/schemas/CompletionMessage" + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Optional reason why generation stopped, if complete" } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "model_response" - ] + "event_type", + "delta" + ], + "description": "An event during chat completion generation." }, - "MemoryRetrievalStep": { + "ChatCompletionResponseStreamChunk": { "type": "object", "properties": { - "turn_id": { - "type": "string" + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } }, - "step_id": { - "type": "string" + "event": { + "$ref": "#/components/schemas/ChatCompletionResponseEvent", + "description": "The event containing the new content" + } + }, + "additionalProperties": false, + "required": [ + "event" + ], + "description": "A chunk of a streamed chat completion response." 
+ }, + "ContentDelta": { + "oneOf": [ + { + "$ref": "#/components/schemas/TextDelta" }, - "started_at": { - "type": "string", - "format": "date-time" + { + "$ref": "#/components/schemas/ImageDelta" }, - "completed_at": { + { + "$ref": "#/components/schemas/ToolCallDelta" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "text": "#/components/schemas/TextDelta", + "image": "#/components/schemas/ImageDelta", + "tool_call": "#/components/schemas/ToolCallDelta" + } + } + }, + "ImageDelta": { + "type": "object", + "properties": { + "type": { "type": "string", - "format": "date-time" + "const": "image", + "default": "image" }, - "step_type": { + "image": { + "type": "string", + "contentEncoding": "base64" + } + }, + "additionalProperties": false, + "required": [ + "type", + "image" + ] + }, + "TextDelta": { + "type": "object", + "properties": { + "type": { "type": "string", - "const": "memory_retrieval", - "default": "memory_retrieval" + "const": "text", + "default": "text" }, - "vector_db_ids": { + "text": { "type": "string" - }, - "inserted_context": { - "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "vector_db_ids", - "inserted_context" + "type", + "text" ] }, - "SafetyViolation": { + "ToolCallDelta": { "type": "object", "properties": { - "violation_level": { - "$ref": "#/components/schemas/ViolationLevel" + "type": { + "type": "string", + "const": "tool_call", + "default": "tool_call" }, - "user_message": { - "type": "string" + "tool_call": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/ToolCall" + } + ] }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "parse_status": { + "type": "string", + "enum": [ + "started", + 
"in_progress", + "failed", + "succeeded" + ] } }, "additionalProperties": false, "required": [ - "violation_level", - "metadata" + "type", + "tool_call", + "parse_status" ] }, - "ShieldCallStep": { + "CompletionRequest": { "type": "object", "properties": { - "turn_id": { - "type": "string" + "model_id": { + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." }, - "step_id": { - "type": "string" + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content to generate a completion for" }, - "started_at": { - "type": "string", - "format": "date-time" + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "(Optional) Parameters to control the sampling strategy" }, - "completed_at": { - "type": "string", - "format": "date-time" + "response_format": { + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding" }, - "step_type": { - "type": "string", - "const": "shield_call", - "default": "shield_call" + "stream": { + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." }, - "violation": { - "$ref": "#/components/schemas/SafetyViolation" + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." 
} }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type" + "model_id", + "content" ] }, - "ToolExecutionStep": { + "CompletionResponseStreamChunk": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, - "step_id": { - "type": "string" - }, - "started_at": { - "type": "string", - "format": "date-time" - }, - "completed_at": { + "delta": { "type": "string", - "format": "date-time" + "description": "New content generated since last chunk. This can be one or more tokens." }, - "step_type": { + "stop_reason": { "type": "string", - "const": "tool_execution", - "default": "tool_execution" - }, - "tool_calls": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolCall" - } + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Optional reason why generation stopped, if complete" }, - "tool_responses": { + "logprobs": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolResponse" - } + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "tool_calls", - "tool_responses" + "delta" + ], + "description": "A chunk of a streamed completion response." 
+ }, + "CreateAgentRequest": { + "type": "object", + "properties": { + "agent_config": { + "$ref": "#/components/schemas/AgentConfig" + } + }, + "additionalProperties": false, + "required": [ + "agent_config" ] }, - "ToolResponse": { + "AgentCreateResponse": { "type": "object", "properties": { - "call_id": { + "agent_id": { "type": "string" - }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "call_id", - "tool_name", - "content" + "agent_id" ] }, - "Turn": { + "CreateAgentSessionRequest": { "type": "object", "properties": { - "turn_id": { + "session_name": { "type": "string" - }, + } + }, + "additionalProperties": false, + "required": [ + "session_name" + ] + }, + "AgentSessionCreateResponse": { + "type": "object", + "properties": { "session_id": { "type": "string" - }, - "input_messages": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - } - ] - } - }, - "steps": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/InferenceStep" - }, - { - "$ref": "#/components/schemas/ToolExecutionStep" - }, + } + }, + "additionalProperties": false, + "required": [ + "session_id" + ] + }, + "CreateAgentTurnRequest": { + "type": "object", + "properties": { + "messages": { + "type": "array", + "items": { + "oneOf": [ { - "$ref": "#/components/schemas/ShieldCallStep" + "$ref": "#/components/schemas/UserMessage" }, { - "$ref": "#/components/schemas/MemoryRetrievalStep" - } - ], - "discriminator": { - "propertyName": "step_type", - "mapping": { - "inference": "#/components/schemas/InferenceStep", - "tool_execution": "#/components/schemas/ToolExecutionStep", - "shield_call": 
"#/components/schemas/ShieldCallStep", - "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" + "$ref": "#/components/schemas/ToolResponseMessage" } - } + ] } }, - "output_message": { - "$ref": "#/components/schemas/CompletionMessage" + "stream": { + "type": "boolean" }, - "output_attachments": { + "documents": { "type": "array", "items": { "type": "object", @@ -4284,179 +4755,100 @@ ] } }, - "started_at": { - "type": "string", - "format": "date-time" + "toolgroups": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AgentTool" + } }, - "completed_at": { - "type": "string", - "format": "date-time" + "tool_config": { + "$ref": "#/components/schemas/ToolConfig" } }, "additionalProperties": false, "required": [ - "turn_id", - "session_id", - "input_messages", - "steps", - "output_message", - "started_at" - ], - "description": "A single turn in an interaction with an Agentic System." - }, - "ViolationLevel": { - "type": "string", - "enum": [ - "info", - "warn", - "error" + "messages" ] }, - "AgentTurnResponseEvent": { + "InferenceStep": { "type": "object", "properties": { - "payload": { - "$ref": "#/components/schemas/AgentTurnResponseEventPayload" - } - }, - "additionalProperties": false, - "required": [ - "payload" - ] - }, - "AgentTurnResponseEventPayload": { - "oneOf": [ - { - "$ref": "#/components/schemas/AgentTurnResponseStepStartPayload" - }, - { - "$ref": "#/components/schemas/AgentTurnResponseStepProgressPayload" + "turn_id": { + "type": "string" }, - { - "$ref": "#/components/schemas/AgentTurnResponseStepCompletePayload" + "step_id": { + "type": "string" }, - { - "$ref": "#/components/schemas/AgentTurnResponseTurnStartPayload" + "started_at": { + "type": "string", + "format": "date-time" }, - { - "$ref": "#/components/schemas/AgentTurnResponseTurnCompletePayload" - } - ], - "discriminator": { - "propertyName": "event_type", - "mapping": { - "step_start": "#/components/schemas/AgentTurnResponseStepStartPayload", - "step_progress": 
"#/components/schemas/AgentTurnResponseStepProgressPayload", - "step_complete": "#/components/schemas/AgentTurnResponseStepCompletePayload", - "turn_start": "#/components/schemas/AgentTurnResponseTurnStartPayload", - "turn_complete": "#/components/schemas/AgentTurnResponseTurnCompletePayload" - } - } - }, - "AgentTurnResponseStepCompletePayload": { - "type": "object", - "properties": { - "event_type": { + "completed_at": { "type": "string", - "const": "step_complete", - "default": "step_complete" + "format": "date-time" }, "step_type": { "type": "string", - "enum": [ - "inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] - }, - "step_id": { - "type": "string" + "const": "inference", + "default": "inference" }, - "step_details": { - "oneOf": [ - { - "$ref": "#/components/schemas/InferenceStep" - }, - { - "$ref": "#/components/schemas/ToolExecutionStep" - }, - { - "$ref": "#/components/schemas/ShieldCallStep" - }, - { - "$ref": "#/components/schemas/MemoryRetrievalStep" - } - ], - "discriminator": { - "propertyName": "step_type", - "mapping": { - "inference": "#/components/schemas/InferenceStep", - "tool_execution": "#/components/schemas/ToolExecutionStep", - "shield_call": "#/components/schemas/ShieldCallStep", - "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" - } - } + "model_response": { + "$ref": "#/components/schemas/CompletionMessage" } }, "additionalProperties": false, "required": [ - "event_type", - "step_type", + "turn_id", "step_id", - "step_details" + "step_type", + "model_response" ] }, - "AgentTurnResponseStepProgressPayload": { + "MemoryRetrievalStep": { "type": "object", "properties": { - "event_type": { + "turn_id": { + "type": "string" + }, + "step_id": { + "type": "string" + }, + "started_at": { "type": "string", - "const": "step_progress", - "default": "step_progress" + "format": "date-time" + }, + "completed_at": { + "type": "string", + "format": "date-time" }, "step_type": { "type": "string", - "enum": [ - 
"inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] + "const": "memory_retrieval", + "default": "memory_retrieval" }, - "step_id": { + "vector_db_ids": { "type": "string" }, - "delta": { - "$ref": "#/components/schemas/ContentDelta" + "inserted_context": { + "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "event_type", - "step_type", + "turn_id", "step_id", - "delta" + "step_type", + "vector_db_ids", + "inserted_context" ] }, - "AgentTurnResponseStepStartPayload": { + "SafetyViolation": { "type": "object", "properties": { - "event_type": { - "type": "string", - "const": "step_start", - "default": "step_start" - }, - "step_type": { - "type": "string", - "enum": [ - "inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] + "violation_level": { + "$ref": "#/components/schemas/ViolationLevel" }, - "step_id": { + "user_message": { "type": "string" }, "metadata": { @@ -4487,432 +4879,510 @@ }, "additionalProperties": false, "required": [ - "event_type", - "step_type", - "step_id" + "violation_level", + "metadata" ] }, - "AgentTurnResponseStreamChunk": { - "type": "object", - "properties": { - "event": { - "$ref": "#/components/schemas/AgentTurnResponseEvent" - } - }, - "additionalProperties": false, - "required": [ - "event" - ], - "description": "streamed agent turn completion response." 
- }, - "AgentTurnResponseTurnCompletePayload": { + "ShieldCallStep": { "type": "object", "properties": { - "event_type": { + "turn_id": { + "type": "string" + }, + "step_id": { + "type": "string" + }, + "started_at": { "type": "string", - "const": "turn_complete", - "default": "turn_complete" + "format": "date-time" }, - "turn": { - "$ref": "#/components/schemas/Turn" - } - }, - "additionalProperties": false, - "required": [ - "event_type", - "turn" - ] - }, - "AgentTurnResponseTurnStartPayload": { - "type": "object", - "properties": { - "event_type": { + "completed_at": { + "type": "string", + "format": "date-time" + }, + "step_type": { "type": "string", - "const": "turn_start", - "default": "turn_start" + "const": "shield_call", + "default": "shield_call" }, - "turn_id": { - "type": "string" + "violation": { + "$ref": "#/components/schemas/SafetyViolation" } }, "additionalProperties": false, "required": [ - "event_type", - "turn_id" + "turn_id", + "step_id", + "step_type" ] }, - "EmbeddingsRequest": { + "ToolExecutionStep": { "type": "object", "properties": { - "model_id": { + "turn_id": { + "type": "string" + }, + "step_id": { + "type": "string" + }, + "started_at": { "type": "string", - "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." + "format": "date-time" }, - "contents": { + "completed_at": { + "type": "string", + "format": "date-time" + }, + "step_type": { + "type": "string", + "const": "tool_execution", + "default": "tool_execution" + }, + "tool_calls": { "type": "array", "items": { - "$ref": "#/components/schemas/InterleavedContent" - }, - "description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text." 
- } - }, - "additionalProperties": false, - "required": [ - "model_id", - "contents" - ] - }, - "EmbeddingsResponse": { - "type": "object", - "properties": { - "embeddings": { + "$ref": "#/components/schemas/ToolCall" + } + }, + "tool_responses": { "type": "array", "items": { - "type": "array", - "items": { - "type": "number" - } - }, - "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}" + "$ref": "#/components/schemas/ToolResponse" + } } }, "additionalProperties": false, "required": [ - "embeddings" - ], - "description": "Response containing generated embeddings." + "turn_id", + "step_id", + "step_type", + "tool_calls", + "tool_responses" + ] }, - "AgentCandidate": { + "ToolResponse": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "agent", - "default": "agent" + "call_id": { + "type": "string" }, - "config": { - "$ref": "#/components/schemas/AgentConfig" + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" + } + ] + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "type", - "config" - ] - }, - "AggregationFunctionType": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" + "call_id", + "tool_name", + "content" ] }, - "AppEvalTaskConfig": { + "Turn": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "app", - "default": "app" + "turn_id": { + "type": "string" }, - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate" + "session_id": { + "type": "string" }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" + "input_messages": { + "type": 
"array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + } + ] } }, - "num_examples": { - "type": "integer" - } - }, - "additionalProperties": false, - "required": [ - "type", - "eval_candidate", - "scoring_params" - ] - }, - "BasicScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "basic", - "default": "basic" + "steps": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/InferenceStep" + }, + { + "$ref": "#/components/schemas/ToolExecutionStep" + }, + { + "$ref": "#/components/schemas/ShieldCallStep" + }, + { + "$ref": "#/components/schemas/MemoryRetrievalStep" + } + ], + "discriminator": { + "propertyName": "step_type", + "mapping": { + "inference": "#/components/schemas/InferenceStep", + "tool_execution": "#/components/schemas/ToolExecutionStep", + "shield_call": "#/components/schemas/ShieldCallStep", + "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" + } + } + } }, - "aggregation_functions": { + "output_message": { + "$ref": "#/components/schemas/CompletionMessage" + }, + "output_attachments": { "type": "array", "items": { - "$ref": "#/components/schemas/AggregationFunctionType" + "type": "object", + "properties": { + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/InterleavedContentItem" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContentItem" + } + }, + { + "$ref": "#/components/schemas/URL" + } + ] + }, + "mime_type": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "content", + "mime_type" + ] } + }, + "started_at": { + "type": "string", + "format": "date-time" + }, + "completed_at": { + "type": "string", + "format": "date-time" } }, "additionalProperties": false, "required": [ - "type" + "turn_id", + "session_id", + "input_messages", + "steps", + 
"output_message", + "started_at" + ], + "description": "A single turn in an interaction with an Agentic System." + }, + "ViolationLevel": { + "type": "string", + "enum": [ + "info", + "warn", + "error" ] }, - "BenchmarkEvalTaskConfig": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "benchmark", - "default": "benchmark" - }, - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate" - }, - "num_examples": { - "type": "integer" + "AgentTurnResponseEvent": { + "type": "object", + "properties": { + "payload": { + "$ref": "#/components/schemas/AgentTurnResponseEventPayload" } }, "additionalProperties": false, "required": [ - "type", - "eval_candidate" + "payload" ] }, - "EvalCandidate": { + "AgentTurnResponseEventPayload": { "oneOf": [ { - "$ref": "#/components/schemas/ModelCandidate" + "$ref": "#/components/schemas/AgentTurnResponseStepStartPayload" }, { - "$ref": "#/components/schemas/AgentCandidate" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "model": "#/components/schemas/ModelCandidate", - "agent": "#/components/schemas/AgentCandidate" - } - } - }, - "EvalTaskConfig": { - "oneOf": [ + "$ref": "#/components/schemas/AgentTurnResponseStepProgressPayload" + }, + { + "$ref": "#/components/schemas/AgentTurnResponseStepCompletePayload" + }, { - "$ref": "#/components/schemas/BenchmarkEvalTaskConfig" + "$ref": "#/components/schemas/AgentTurnResponseTurnStartPayload" }, { - "$ref": "#/components/schemas/AppEvalTaskConfig" + "$ref": "#/components/schemas/AgentTurnResponseTurnCompletePayload" } ], "discriminator": { - "propertyName": "type", + "propertyName": "event_type", "mapping": { - "benchmark": "#/components/schemas/BenchmarkEvalTaskConfig", - "app": "#/components/schemas/AppEvalTaskConfig" + "step_start": "#/components/schemas/AgentTurnResponseStepStartPayload", + "step_progress": "#/components/schemas/AgentTurnResponseStepProgressPayload", + "step_complete": 
"#/components/schemas/AgentTurnResponseStepCompletePayload", + "turn_start": "#/components/schemas/AgentTurnResponseTurnStartPayload", + "turn_complete": "#/components/schemas/AgentTurnResponseTurnCompletePayload" } } }, - "LLMAsJudgeScoringFnParams": { + "AgentTurnResponseStepCompletePayload": { "type": "object", "properties": { - "type": { + "event_type": { "type": "string", - "const": "llm_as_judge", - "default": "llm_as_judge" + "const": "step_complete", + "default": "step_complete" }, - "judge_model": { - "type": "string" + "step_type": { + "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ] }, - "prompt_template": { + "step_id": { "type": "string" }, - "judge_score_regexes": { - "type": "array", - "items": { - "type": "string" - } - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" + "step_details": { + "oneOf": [ + { + "$ref": "#/components/schemas/InferenceStep" + }, + { + "$ref": "#/components/schemas/ToolExecutionStep" + }, + { + "$ref": "#/components/schemas/ShieldCallStep" + }, + { + "$ref": "#/components/schemas/MemoryRetrievalStep" + } + ], + "discriminator": { + "propertyName": "step_type", + "mapping": { + "inference": "#/components/schemas/InferenceStep", + "tool_execution": "#/components/schemas/ToolExecutionStep", + "shield_call": "#/components/schemas/ShieldCallStep", + "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" + } } } }, "additionalProperties": false, "required": [ - "type", - "judge_model" + "event_type", + "step_type", + "step_id", + "step_details" ] }, - "ModelCandidate": { + "AgentTurnResponseStepProgressPayload": { "type": "object", "properties": { - "type": { + "event_type": { "type": "string", - "const": "model", - "default": "model" + "const": "step_progress", + "default": "step_progress" }, - "model": { - "type": "string" + "step_type": { + "type": "string", + "enum": [ + "inference", + 
"tool_execution", + "shield_call", + "memory_retrieval" + ] }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "step_id": { + "type": "string" }, - "system_message": { - "$ref": "#/components/schemas/SystemMessage" + "delta": { + "$ref": "#/components/schemas/ContentDelta" } }, "additionalProperties": false, "required": [ - "type", - "model", - "sampling_params" + "event_type", + "step_type", + "step_id", + "delta" ] }, - "RegexParserScoringFnParams": { + "AgentTurnResponseStepStartPayload": { "type": "object", "properties": { - "type": { + "event_type": { "type": "string", - "const": "regex_parser", - "default": "regex_parser" + "const": "step_start", + "default": "step_start" }, - "parsing_regexes": { - "type": "array", - "items": { - "type": "string" - } + "step_type": { + "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ] }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" + "step_id": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] } } }, "additionalProperties": false, "required": [ - "type" + "event_type", + "step_type", + "step_id" ] }, - "ScoringFnParams": { - "oneOf": [ - { - "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" - }, - { - "$ref": "#/components/schemas/RegexParserScoringFnParams" - }, - { - "$ref": "#/components/schemas/BasicScoringFnParams" + "AgentTurnResponseStreamChunk": { + "type": "object", + "properties": { + "event": { + "$ref": "#/components/schemas/AgentTurnResponseEvent" } + }, + "additionalProperties": false, + "required": [ + "event" ], - "discriminator": { - "propertyName": "type", - "mapping": { - "llm_as_judge": 
"#/components/schemas/LLMAsJudgeScoringFnParams", - "regex_parser": "#/components/schemas/RegexParserScoringFnParams", - "basic": "#/components/schemas/BasicScoringFnParams" - } - } + "description": "streamed agent turn completion response." }, - "EvaluateRowsRequest": { + "AgentTurnResponseTurnCompletePayload": { "type": "object", "properties": { - "input_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } + "event_type": { + "type": "string", + "const": "turn_complete", + "default": "turn_complete" + }, + "turn": { + "$ref": "#/components/schemas/Turn" + } + }, + "additionalProperties": false, + "required": [ + "event_type", + "turn" + ] + }, + "AgentTurnResponseTurnStartPayload": { + "type": "object", + "properties": { + "event_type": { + "type": "string", + "const": "turn_start", + "default": "turn_start" }, - "scoring_functions": { + "turn_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "event_type", + "turn_id" + ] + }, + "EmbeddingsRequest": { + "type": "object", + "properties": { + "model_id": { + "type": "string", + "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." + }, + "contents": { "type": "array", "items": { - "type": "string" - } - }, - "task_config": { - "$ref": "#/components/schemas/EvalTaskConfig" + "$ref": "#/components/schemas/InterleavedContent" + }, + "description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text." 
} }, "additionalProperties": false, "required": [ - "input_rows", - "scoring_functions", - "task_config" + "model_id", + "contents" ] }, - "EvaluateResponse": { + "EmbeddingsResponse": { "type": "object", "properties": { - "generations": { + "embeddings": { "type": "array", "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] + "type": "array", + "items": { + "type": "number" } - } - }, - "scores": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - } + }, + "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}" } }, "additionalProperties": false, "required": [ - "generations", - "scores" - ] + "embeddings" + ], + "description": "Response containing generated embeddings." 
}, - "ScoringResult": { + "EvaluateRowsRequest": { "type": "object", "properties": { - "score_rows": { + "input_rows": { "type": "array", "items": { "type": "object", @@ -4940,36 +5410,21 @@ } } }, - "aggregated_results": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] + "scoring_functions": { + "type": "array", + "items": { + "type": "string" } + }, + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, "required": [ - "score_rows", - "aggregated_results" + "input_rows", + "scoring_functions", + "task_config" ] }, "Session": { @@ -5287,69 +5742,6 @@ "type" ] }, - "EvalTask": { - "type": "object", - "properties": { - "identifier": { - "type": "string" - }, - "provider_resource_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "type": { - "type": "string", - "const": "eval_task", - "default": "eval_task" - }, - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "identifier", - "provider_resource_id", - "provider_id", - "type", - "dataset_id", - "scoring_functions", - "metadata" - ] - }, "Model": { "type": "object", "properties": { @@ -5891,15 +6283,6 @@ ], "description": "Artifacts of a finetuning job." 
}, - "JobStatus": { - "type": "string", - "enum": [ - "completed", - "in_progress", - "failed", - "scheduled" - ] - }, "PostTrainingJobStatusResponse": { "type": "object", "properties": { @@ -6243,21 +6626,6 @@ "data" ] }, - "ListEvalTasksResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/EvalTask" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ] - }, "ListModelsResponse": { "type": "object", "properties": { @@ -7169,22 +7537,22 @@ "data" ] }, - "RegisterDatasetRequest": { + "RegisterBenchmarkRequest": { "type": "object", "properties": { + "benchmark_id": { + "type": "string" + }, "dataset_id": { "type": "string" }, - "dataset_schema": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ParamType" + "scoring_functions": { + "type": "array", + "items": { + "type": "string" } }, - "url": { - "$ref": "#/components/schemas/URL" - }, - "provider_dataset_id": { + "provider_benchmark_id": { "type": "string" }, "provider_id": { @@ -7218,27 +7586,27 @@ }, "additionalProperties": false, "required": [ + "benchmark_id", "dataset_id", - "dataset_schema", - "url" + "scoring_functions" ] }, - "RegisterEvalTaskRequest": { + "RegisterDatasetRequest": { "type": "object", "properties": { - "eval_task_id": { - "type": "string" - }, "dataset_id": { "type": "string" }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" + "dataset_schema": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ParamType" } }, - "provider_eval_task_id": { + "url": { + "$ref": "#/components/schemas/URL" + }, + "provider_dataset_id": { "type": "string" }, "provider_id": { @@ -7272,9 +7640,9 @@ }, "additionalProperties": false, "required": [ - "eval_task_id", "dataset_id", - "scoring_functions" + "dataset_schema", + "url" ] }, "RegisterModelRequest": { @@ -7468,7 +7836,7 @@ "type": "object", "properties": { "task_config": { - 
"$ref": "#/components/schemas/EvalTaskConfig" + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, @@ -7476,18 +7844,6 @@ "task_config" ] }, - "Job": { - "type": "object", - "properties": { - "job_id": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_id" - ] - }, "RunShieldRequest": { "type": "object", "properties": { @@ -7970,6 +8326,9 @@ { "name": "BatchInference (Coming Soon)" }, + { + "name": "Benchmarks" + }, { "name": "DatasetIO" }, @@ -7979,9 +8338,6 @@ { "name": "Eval" }, - { - "name": "EvalTasks" - }, { "name": "Inference", "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", @@ -8033,10 +8389,10 @@ "tags": [ "Agents", "BatchInference (Coming Soon)", + "Benchmarks", "DatasetIO", "Datasets", "Eval", - "EvalTasks", "Inference", "Inspect", "Models", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index a646d7e089..b30025020b 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,6 +10,175 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: + /v1/eval/tasks/{task_id}/evaluations: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedEvaluateRowsRequest' + required: true + deprecated: true + /v1/eval-tasks/{task_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/Benchmark' + - type: 'null' + tags: 
+ - Benchmarks + description: '' + parameters: + - name: eval_task_id + in: query + required: true + schema: + type: string + deprecated: true + /v1/eval/tasks/{task_id}/jobs/{job_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/JobStatus' + - type: 'null' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + delete: + responses: + '200': + description: OK + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval-tasks: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListBenchmarksResponse' + tags: + - Benchmarks + description: '' + parameters: [] + deprecated: true + post: + responses: + '200': + description: OK + tags: + - Benchmarks + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' + required: true + deprecated: true + /v1/eval/tasks/{task_id}/jobs: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/Job' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + 
content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedRunEvalRequest' + required: true + deprecated: true /v1/datasetio/rows: get: responses: @@ -322,7 +491,7 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/tasks/{task_id}/evaluations: + /v1/eval/benchmarks/{benchmark_id}/evaluations: post: responses: '200': @@ -335,7 +504,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -407,7 +576,7 @@ paths: required: true schema: type: string - /v1/datasets/{dataset_id}: + /v1/eval/benchmarks/{benchmark_id}: get: responses: '200': @@ -416,21 +585,28 @@ paths: application/json: schema: oneOf: - - $ref: '#/components/schemas/Dataset' + - $ref: '#/components/schemas/Benchmark' - type: 'null' tags: - - Datasets + - Benchmarks description: '' parameters: - - name: dataset_id + - name: benchmark_id in: path required: true schema: type: string - delete: + /v1/datasets/{dataset_id}: + get: responses: '200': description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/Dataset' + - type: 'null' tags: - Datasets description: '' @@ -440,22 +616,15 @@ paths: required: true schema: type: string - /v1/eval-tasks/{eval_task_id}: - get: + delete: responses: '200': description: OK - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/EvalTask' - - type: 'null' tags: - - EvalTasks + - Datasets description: '' parameters: - - name: eval_task_id + - name: dataset_id in: path required: true schema: @@ -802,7 +971,7 @@ paths: schema: $ref: '#/components/schemas/InvokeToolRequest' required: true - /v1/eval/tasks/{task_id}/jobs/{job_id}: + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}: get: responses: '200': @@ -817,7 +986,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -835,7 +1004,7 @@ paths: - Eval description: '' 
parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -845,7 +1014,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: get: responses: '200': @@ -858,17 +1027,17 @@ paths: - Eval description: '' parameters: - - name: job_id + - name: benchmark_id in: path required: true schema: type: string - - name: task_id + - name: job_id in: path required: true schema: type: string - /v1/datasets: + /v1/eval/benchmarks: get: responses: '200': @@ -876,9 +1045,9 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ListDatasetsResponse' + $ref: '#/components/schemas/ListBenchmarksResponse' tags: - - Datasets + - Benchmarks description: '' parameters: [] post: @@ -886,16 +1055,16 @@ paths: '200': description: OK tags: - - Datasets + - Benchmarks description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/RegisterDatasetRequest' + $ref: '#/components/schemas/RegisterBenchmarkRequest' required: true - /v1/eval-tasks: + /v1/datasets: get: responses: '200': @@ -903,9 +1072,9 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ListEvalTasksResponse' + $ref: '#/components/schemas/ListDatasetsResponse' tags: - - EvalTasks + - Datasets description: '' parameters: [] post: @@ -913,14 +1082,14 @@ paths: '200': description: OK tags: - - EvalTasks + - Datasets description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/RegisterEvalTaskRequest' + $ref: '#/components/schemas/RegisterDatasetRequest' required: true /v1/models: get: @@ -1278,7 +1447,7 @@ paths: type: array items: type: string - /v1/eval/tasks/{task_id}/jobs: + /v1/eval/benchmarks/{benchmark_id}/jobs: post: responses: '200': @@ -1291,7 +1460,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true 
schema: @@ -1429,65 +1598,146 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: - AppendRowsRequest: + AgentCandidate: type: object properties: - dataset_id: + type: type: string - rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + const: agent + default: agent + config: + $ref: '#/components/schemas/AgentConfig' additionalProperties: false required: - - dataset_id - - rows - CompletionMessage: + - type + - config + AgentConfig: type: object properties: - role: - type: string - const: assistant - default: assistant - description: >- - Must be "assistant" to identify this as the model's response - content: - $ref: '#/components/schemas/InterleavedContent' - description: The content of the model's response - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: - The model finished generating the entire response. - `StopReason.end_of_message`: - The model finished generating but generated a partial response -- usually, - a tool call. The user may call the tool and continue the conversation - with the tool's response. - `StopReason.out_of_tokens`: The model ran - out of token budget. - tool_calls: + sampling_params: + $ref: '#/components/schemas/SamplingParams' + input_shields: type: array items: - $ref: '#/components/schemas/ToolCall' - description: >- - List of tool calls. Each tool call is a ToolCall object. - additionalProperties: false - required: - - role - - content - - stop_reason - description: >- - A message containing the model's (assistant) response in a chat conversation. 
+ type: string + output_shields: + type: array + items: + type: string + toolgroups: + type: array + items: + $ref: '#/components/schemas/AgentTool' + client_tools: + type: array + items: + $ref: '#/components/schemas/ToolDef' + tool_choice: + type: string + enum: + - auto + - required + description: >- + Whether tool use is required or automatic. This is a hint to the model + which may not be followed. It depends on the Instruction Following capabilities + of the model. + tool_prompt_format: + type: string + enum: + - json + - function_tag + - python_list + description: >- + Prompt format for calling custom / zero shot tools. + tool_config: + $ref: '#/components/schemas/ToolConfig' + max_infer_iters: + type: integer + default: 10 + model: + type: string + instructions: + type: string + enable_session_persistence: + type: boolean + response_format: + $ref: '#/components/schemas/ResponseFormat' + additionalProperties: false + required: + - model + - instructions + - enable_session_persistence + AgentTool: + oneOf: + - type: string + - type: object + properties: + name: + type: string + args: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + - args + AggregationFunctionType: + type: string + enum: + - average + - median + - categorical_count + - accuracy + BasicScoringFnParams: + type: object + properties: + type: + type: string + const: basic + default: basic + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type + BenchmarkConfig: + type: object + properties: + type: + type: string + const: benchmark + default: benchmark + eval_candidate: + $ref: '#/components/schemas/EvalCandidate' + scoring_params: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringFnParams' + num_examples: + type: integer + 
additionalProperties: false + required: + - type + - eval_candidate + - scoring_params + EvalCandidate: + oneOf: + - $ref: '#/components/schemas/ModelCandidate' + - $ref: '#/components/schemas/AgentCandidate' + discriminator: + propertyName: type + mapping: + model: '#/components/schemas/ModelCandidate' + agent: '#/components/schemas/AgentCandidate' GrammarResponseFormat: type: object properties: @@ -1598,19 +1848,65 @@ components: - json_schema description: >- Configuration for JSON schema-guided response generation. - Message: - oneOf: - - $ref: '#/components/schemas/UserMessage' - - $ref: '#/components/schemas/SystemMessage' - - $ref: '#/components/schemas/ToolResponseMessage' - - $ref: '#/components/schemas/CompletionMessage' - discriminator: - propertyName: role - mapping: - user: '#/components/schemas/UserMessage' - system: '#/components/schemas/SystemMessage' - tool: '#/components/schemas/ToolResponseMessage' - assistant: '#/components/schemas/CompletionMessage' + LLMAsJudgeScoringFnParams: + type: object + properties: + type: + type: string + const: llm_as_judge + default: llm_as_judge + judge_model: + type: string + prompt_template: + type: string + judge_score_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type + - judge_model + ModelCandidate: + type: object + properties: + type: + type: string + const: model + default: model + model: + type: string + sampling_params: + $ref: '#/components/schemas/SamplingParams' + system_message: + $ref: '#/components/schemas/SystemMessage' + additionalProperties: false + required: + - type + - model + - sampling_params + RegexParserScoringFnParams: + type: object + properties: + type: + type: string + const: regex_parser + default: regex_parser + parsing_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: 
'#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type ResponseFormat: oneOf: - $ref: '#/components/schemas/JsonSchemaResponseFormat' @@ -1645,6 +1941,17 @@ components: greedy: '#/components/schemas/GreedySamplingStrategy' top_p: '#/components/schemas/TopPSamplingStrategy' top_k: '#/components/schemas/TopKSamplingStrategy' + ScoringFnParams: + oneOf: + - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' + - $ref: '#/components/schemas/RegexParserScoringFnParams' + - $ref: '#/components/schemas/BasicScoringFnParams' + discriminator: + propertyName: type + mapping: + llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' + regex_parser: '#/components/schemas/RegexParserScoringFnParams' + basic: '#/components/schemas/BasicScoringFnParams' SystemMessage: type: object properties: @@ -1683,75 +1990,76 @@ components: - type - text description: A text content item - ToolCall: + ToolConfig: type: object properties: - call_id: + tool_choice: + type: string + enum: + - auto + - required + description: >- + (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. + default: auto + tool_prompt_format: + type: string + enum: + - json + - function_tag + - python_list + description: >- + (Optional) Instructs the model how to format tool calls. By default, Llama + Stack will attempt to use a format that is best adapted to the model. + - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. + - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a + tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python + syntax -- a list of function calls. + system_message_behavior: + type: string + enum: + - append + - replace + description: >- + (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: + Appends the provided system message to the default system prompt. 
- `SystemMessageBehavior.replace`: + Replaces the default system prompt with the provided system message. The + system message can include the string '{{function_definitions}}' to indicate + where the function definitions should be inserted. + default: append + additionalProperties: false + required: + - system_message_behavior + description: Configuration for tool use. + ToolDef: + type: object + properties: + name: type: string - tool_name: - oneOf: - - type: string - enum: - - brave_search - - wolfram_alpha - - photogen - - code_interpreter - - type: string - arguments: - type: object - additionalProperties: - oneOf: - - type: string - - type: integer - - type: number - - type: boolean - - type: 'null' - - type: array - items: - oneOf: - - type: string - - type: integer - - type: number - - type: boolean - - type: 'null' - - type: object - additionalProperties: - oneOf: - - type: string - - type: integer - - type: number - - type: boolean - - type: 'null' - additionalProperties: false - required: - - call_id - - tool_name - - arguments - ToolDefinition: - type: object - properties: - tool_name: - oneOf: - - type: string - enum: - - brave_search - - wolfram_alpha - - photogen - - code_interpreter - - type: string description: type: string parameters: + type: array + items: + $ref: '#/components/schemas/ToolParameter' + metadata: type: object additionalProperties: - $ref: '#/components/schemas/ToolParamDefinition' + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object additionalProperties: false required: - - tool_name - ToolParamDefinition: + - name + ToolParameter: type: object properties: - param_type: + name: + type: string + parameter_type: type: string description: type: string @@ -1768,41 +2076,10 @@ components: - type: object additionalProperties: false required: - - param_type - ToolResponseMessage: - type: object - properties: - role: - type: string - const: tool - default: tool - description: >- - 
Must be "tool" to identify this as a tool response - call_id: - type: string - description: >- - Unique identifier for the tool call this response is for - tool_name: - oneOf: - - type: string - enum: - - brave_search - - wolfram_alpha - - photogen - - code_interpreter - - type: string - description: Name of the tool that was called - content: - $ref: '#/components/schemas/InterleavedContent' - description: The response content from the tool - additionalProperties: false - required: - - role - - call_id - - tool_name - - content - description: >- - A message representing the result of a tool invocation. + - name + - parameter_type + - description + - required TopKSamplingStrategy: type: object properties: @@ -1834,11 +2111,382 @@ components: URL: type: object properties: - uri: + uri: + type: string + additionalProperties: false + required: + - uri + DeprecatedEvaluateRowsRequest: + type: object + properties: + input_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scoring_functions: + type: array + items: + type: string + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - input_rows + - scoring_functions + - task_config + EvaluateResponse: + type: object + properties: + generations: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scores: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + additionalProperties: false + required: + - generations + - scores + ScoringResult: + type: object + properties: + score_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + aggregated_results: + type: object + 
additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - score_rows + - aggregated_results + Benchmark: + type: object + properties: + identifier: + type: string + provider_resource_id: + type: string + provider_id: + type: string + type: + type: string + const: benchmark + default: benchmark + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - identifier + - provider_resource_id + - provider_id + - type + - dataset_id + - scoring_functions + - metadata + JobStatus: + type: string + enum: + - completed + - in_progress + - failed + - scheduled + ListBenchmarksResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/Benchmark' + additionalProperties: false + required: + - data + DeprecatedRegisterEvalTaskRequest: + type: object + properties: + eval_task_id: + type: string + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + provider_benchmark_id: + type: string + provider_id: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - eval_task_id + - dataset_id + - scoring_functions + DeprecatedRunEvalRequest: + type: object + properties: + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - task_config + Job: + type: object + properties: + job_id: + type: string + additionalProperties: false + required: + - job_id + AppendRowsRequest: + type: object + properties: + dataset_id: + type: string + rows: + type: array + 
items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - dataset_id + - rows + CompletionMessage: + type: object + properties: + role: + type: string + const: assistant + default: assistant + description: >- + Must be "assistant" to identify this as the model's response + content: + $ref: '#/components/schemas/InterleavedContent' + description: The content of the model's response + stop_reason: + type: string + enum: + - end_of_turn + - end_of_message + - out_of_tokens + description: >- + Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: + The model finished generating the entire response. - `StopReason.end_of_message`: + The model finished generating but generated a partial response -- usually, + a tool call. The user may call the tool and continue the conversation + with the tool's response. - `StopReason.out_of_tokens`: The model ran + out of token budget. + tool_calls: + type: array + items: + $ref: '#/components/schemas/ToolCall' + description: >- + List of tool calls. Each tool call is a ToolCall object. + additionalProperties: false + required: + - role + - content + - stop_reason + description: >- + A message containing the model's (assistant) response in a chat conversation. 
+ Message: + oneOf: + - $ref: '#/components/schemas/UserMessage' + - $ref: '#/components/schemas/SystemMessage' + - $ref: '#/components/schemas/ToolResponseMessage' + - $ref: '#/components/schemas/CompletionMessage' + discriminator: + propertyName: role + mapping: + user: '#/components/schemas/UserMessage' + system: '#/components/schemas/SystemMessage' + tool: '#/components/schemas/ToolResponseMessage' + assistant: '#/components/schemas/CompletionMessage' + ToolCall: + type: object + properties: + call_id: + type: string + tool_name: + oneOf: + - type: string + enum: + - brave_search + - wolfram_alpha + - photogen + - code_interpreter + - type: string + arguments: + type: object + additionalProperties: + oneOf: + - type: string + - type: integer + - type: number + - type: boolean + - type: 'null' + - type: array + items: + oneOf: + - type: string + - type: integer + - type: number + - type: boolean + - type: 'null' + - type: object + additionalProperties: + oneOf: + - type: string + - type: integer + - type: number + - type: boolean + - type: 'null' + additionalProperties: false + required: + - call_id + - tool_name + - arguments + ToolDefinition: + type: object + properties: + tool_name: + oneOf: + - type: string + enum: + - brave_search + - wolfram_alpha + - photogen + - code_interpreter + - type: string + description: + type: string + parameters: + type: object + additionalProperties: + $ref: '#/components/schemas/ToolParamDefinition' + additionalProperties: false + required: + - tool_name + ToolParamDefinition: + type: object + properties: + param_type: + type: string + description: + type: string + required: + type: boolean + default: true + default: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - param_type + ToolResponseMessage: + type: object + properties: + role: + type: string + const: tool + default: tool + description: >- + Must be "tool" to 
identify this as a tool response + call_id: type: string + description: >- + Unique identifier for the tool call this response is for + tool_name: + oneOf: + - type: string + enum: + - brave_search + - wolfram_alpha + - photogen + - code_interpreter + - type: string + description: Name of the tool that was called + content: + $ref: '#/components/schemas/InterleavedContent' + description: The response content from the tool additionalProperties: false required: - - uri + - role + - call_id + - tool_name + - content + description: >- + A message representing the result of a tool invocation. UserMessage: type: object properties: @@ -2063,46 +2711,6 @@ components: additionalProperties: false required: - job_uuid - ToolConfig: - type: object - properties: - tool_choice: - type: string - enum: - - auto - - required - description: >- - (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. - default: auto - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - (Optional) Instructs the model how to format tool calls. By default, Llama - Stack will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python - syntax -- a list of function calls. - system_message_behavior: - type: string - enum: - - append - - replace - description: >- - (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: - Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: - Replaces the default system prompt with the provided system message. The - system message can include the string '{{function_definitions}}' to indicate - where the function definitions should be inserted. 
- default: append - additionalProperties: false - required: - - system_message_behavior - description: Configuration for tool use. ChatCompletionRequest: type: object properties: @@ -2251,238 +2859,111 @@ components: type: string contentEncoding: base64 additionalProperties: false - required: - - type - - image - TextDelta: - type: object - properties: - type: - type: string - const: text - default: text - text: - type: string - additionalProperties: false - required: - - type - - text - ToolCallDelta: - type: object - properties: - type: - type: string - const: tool_call - default: tool_call - tool_call: - oneOf: - - type: string - - $ref: '#/components/schemas/ToolCall' - parse_status: - type: string - enum: - - started - - in_progress - - failed - - succeeded - additionalProperties: false - required: - - type - - tool_call - - parse_status - CompletionRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. The model must be registered with - Llama Stack and available via the /models endpoint. - content: - $ref: '#/components/schemas/InterleavedContent' - description: The content to generate a completion for - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: >- - (Optional) Parameters to control the sampling strategy - response_format: - $ref: '#/components/schemas/ResponseFormat' - description: >- - (Optional) Grammar specification for guided (structured) decoding - stream: - type: boolean - description: >- - (Optional) If True, generate an SSE event stream of the response. Defaults - to False. - logprobs: - type: object - properties: - top_k: - type: integer - default: 0 - description: >- - How many tokens (for each position) to return log probabilities for. - additionalProperties: false - description: >- - (Optional) If specified, log probabilities for each token position will - be returned. 
- additionalProperties: false - required: - - model_id - - content - CompletionResponseStreamChunk: - type: object - properties: - delta: - type: string - description: >- - New content generated since last chunk. This can be one or more tokens. - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Optional reason why generation stopped, if complete - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - description: >- - Optional log probabilities for generated tokens - additionalProperties: false - required: - - delta - description: >- - A chunk of a streamed completion response. - AgentConfig: - type: object - properties: - sampling_params: - $ref: '#/components/schemas/SamplingParams' - input_shields: - type: array - items: - type: string - output_shields: - type: array - items: - type: string - toolgroups: - type: array - items: - $ref: '#/components/schemas/AgentTool' - client_tools: - type: array - items: - $ref: '#/components/schemas/ToolDef' - tool_choice: - type: string - enum: - - auto - - required - description: >- - Whether tool use is required or automatic. This is a hint to the model - which may not be followed. It depends on the Instruction Following capabilities - of the model. - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - Prompt format for calling custom / zero shot tools. 
- tool_config: - $ref: '#/components/schemas/ToolConfig' - max_infer_iters: - type: integer - default: 10 - model: - type: string - instructions: - type: string - enable_session_persistence: - type: boolean - response_format: - $ref: '#/components/schemas/ResponseFormat' - additionalProperties: false - required: - - model - - instructions - - enable_session_persistence - AgentTool: - oneOf: - - type: string - - type: object - properties: - name: - type: string - args: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - name - - args - ToolDef: + required: + - type + - image + TextDelta: type: object properties: - name: + type: type: string - description: + const: text + default: text + text: type: string - parameters: - type: array - items: - $ref: '#/components/schemas/ToolParameter' - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object additionalProperties: false required: - - name - ToolParameter: + - type + - text + ToolCallDelta: type: object properties: - name: + type: type: string - parameter_type: + const: tool_call + default: tool_call + tool_call: + oneOf: + - type: string + - $ref: '#/components/schemas/ToolCall' + parse_status: type: string - description: + enum: + - started + - in_progress + - failed + - succeeded + additionalProperties: false + required: + - type + - tool_call + - parse_status + CompletionRequest: + type: object + properties: + model_id: type: string - required: + description: >- + The identifier of the model to use. The model must be registered with + Llama Stack and available via the /models endpoint. 
+ content: + $ref: '#/components/schemas/InterleavedContent' + description: The content to generate a completion for + sampling_params: + $ref: '#/components/schemas/SamplingParams' + description: >- + (Optional) Parameters to control the sampling strategy + response_format: + $ref: '#/components/schemas/ResponseFormat' + description: >- + (Optional) Grammar specification for guided (structured) decoding + stream: type: boolean - default: true - default: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + description: >- + (Optional) If True, generate an SSE event stream of the response. Defaults + to False. + logprobs: + type: object + properties: + top_k: + type: integer + default: 0 + description: >- + How many tokens (for each position) to return log probabilities for. + additionalProperties: false + description: >- + (Optional) If specified, log probabilities for each token position will + be returned. additionalProperties: false required: - - name - - parameter_type - - description - - required + - model_id + - content + CompletionResponseStreamChunk: + type: object + properties: + delta: + type: string + description: >- + New content generated since last chunk. This can be one or more tokens. + stop_reason: + type: string + enum: + - end_of_turn + - end_of_message + - out_of_tokens + description: >- + Optional reason why generation stopped, if complete + logprobs: + type: array + items: + $ref: '#/components/schemas/TokenLogProbs' + description: >- + Optional log probabilities for generated tokens + additionalProperties: false + required: + - delta + description: >- + A chunk of a streamed completion response. CreateAgentRequest: type: object properties: @@ -2893,232 +3374,75 @@ components: type: object properties: event: - $ref: '#/components/schemas/AgentTurnResponseEvent' - additionalProperties: false - required: - - event - description: streamed agent turn completion response. 
- AgentTurnResponseTurnCompletePayload: - type: object - properties: - event_type: - type: string - const: turn_complete - default: turn_complete - turn: - $ref: '#/components/schemas/Turn' - additionalProperties: false - required: - - event_type - - turn - AgentTurnResponseTurnStartPayload: - type: object - properties: - event_type: - type: string - const: turn_start - default: turn_start - turn_id: - type: string - additionalProperties: false - required: - - event_type - - turn_id - EmbeddingsRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. The model must be an embedding model - registered with Llama Stack and available via the /models endpoint. - contents: - type: array - items: - $ref: '#/components/schemas/InterleavedContent' - description: >- - List of contents to generate embeddings for. Note that content can be - multimodal. The behavior depends on the model and provider. Some models - may only support text. - additionalProperties: false - required: - - model_id - - contents - EmbeddingsResponse: - type: object - properties: - embeddings: - type: array - items: - type: array - items: - type: number - description: >- - List of embedding vectors, one per input content. Each embedding is a - list of floats. The dimensionality of the embedding is model-specific; - you can check model metadata using /models/{model_id} - additionalProperties: false - required: - - embeddings - description: >- - Response containing generated embeddings. 
- AgentCandidate: - type: object - properties: - type: - type: string - const: agent - default: agent - config: - $ref: '#/components/schemas/AgentConfig' - additionalProperties: false - required: - - type - - config - AggregationFunctionType: - type: string - enum: - - average - - median - - categorical_count - - accuracy - AppEvalTaskConfig: - type: object - properties: - type: - type: string - const: app - default: app - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - num_examples: - type: integer - additionalProperties: false - required: - - type - - eval_candidate - - scoring_params - BasicScoringFnParams: - type: object - properties: - type: - type: string - const: basic - default: basic - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - BenchmarkEvalTaskConfig: - type: object - properties: - type: - type: string - const: benchmark - default: benchmark - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - num_examples: - type: integer + $ref: '#/components/schemas/AgentTurnResponseEvent' additionalProperties: false required: - - type - - eval_candidate - EvalCandidate: - oneOf: - - $ref: '#/components/schemas/ModelCandidate' - - $ref: '#/components/schemas/AgentCandidate' - discriminator: - propertyName: type - mapping: - model: '#/components/schemas/ModelCandidate' - agent: '#/components/schemas/AgentCandidate' - EvalTaskConfig: - oneOf: - - $ref: '#/components/schemas/BenchmarkEvalTaskConfig' - - $ref: '#/components/schemas/AppEvalTaskConfig' - discriminator: - propertyName: type - mapping: - benchmark: '#/components/schemas/BenchmarkEvalTaskConfig' - app: '#/components/schemas/AppEvalTaskConfig' - LLMAsJudgeScoringFnParams: + - event + description: streamed agent turn completion response. 
+ AgentTurnResponseTurnCompletePayload: type: object properties: - type: - type: string - const: llm_as_judge - default: llm_as_judge - judge_model: - type: string - prompt_template: + event_type: type: string - judge_score_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' + const: turn_complete + default: turn_complete + turn: + $ref: '#/components/schemas/Turn' additionalProperties: false required: - - type - - judge_model - ModelCandidate: + - event_type + - turn + AgentTurnResponseTurnStartPayload: type: object properties: - type: + event_type: type: string - const: model - default: model - model: + const: turn_start + default: turn_start + turn_id: type: string - sampling_params: - $ref: '#/components/schemas/SamplingParams' - system_message: - $ref: '#/components/schemas/SystemMessage' additionalProperties: false required: - - type - - model - - sampling_params - RegexParserScoringFnParams: + - event_type + - turn_id + EmbeddingsRequest: type: object properties: - type: + model_id: type: string - const: regex_parser - default: regex_parser - parsing_regexes: + description: >- + The identifier of the model to use. The model must be an embedding model + registered with Llama Stack and available via the /models endpoint. + contents: type: array items: - type: string - aggregation_functions: + $ref: '#/components/schemas/InterleavedContent' + description: >- + List of contents to generate embeddings for. Note that content can be + multimodal. The behavior depends on the model and provider. Some models + may only support text. + additionalProperties: false + required: + - model_id + - contents + EmbeddingsResponse: + type: object + properties: + embeddings: type: array items: - $ref: '#/components/schemas/AggregationFunctionType' + type: array + items: + type: number + description: >- + List of embedding vectors, one per input content. 
Each embedding is a + list of floats. The dimensionality of the embedding is model-specific; + you can check model metadata using /models/{model_id} additionalProperties: false required: - - type - ScoringFnParams: - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - - $ref: '#/components/schemas/RegexParserScoringFnParams' - - $ref: '#/components/schemas/BasicScoringFnParams' - discriminator: - propertyName: type - mapping: - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - basic: '#/components/schemas/BasicScoringFnParams' + - embeddings + description: >- + Response containing generated embeddings. EvaluateRowsRequest: type: object properties: @@ -3139,64 +3463,12 @@ components: items: type: string task_config: - $ref: '#/components/schemas/EvalTaskConfig' + $ref: '#/components/schemas/BenchmarkConfig' additionalProperties: false required: - input_rows - scoring_functions - task_config - EvaluateResponse: - type: object - properties: - generations: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - scores: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - additionalProperties: false - required: - - generations - - scores - ScoringResult: - type: object - properties: - score_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - aggregated_results: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - score_rows - - aggregated_results Session: type: object properties: @@ -3401,44 +3673,6 @@ components: additionalProperties: false required: - type 
- EvalTask: - type: object - properties: - identifier: - type: string - provider_resource_id: - type: string - provider_id: - type: string - type: - type: string - const: eval_task - default: eval_task - dataset_id: - type: string - scoring_functions: - type: array - items: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - identifier - - provider_resource_id - - provider_id - - type - - dataset_id - - scoring_functions - - metadata Model: type: object properties: @@ -3766,13 +4000,6 @@ components: - job_uuid - checkpoints description: Artifacts of a finetuning job. - JobStatus: - type: string - enum: - - completed - - in_progress - - failed - - scheduled PostTrainingJobStatusResponse: type: object properties: @@ -3977,16 +4204,6 @@ components: additionalProperties: false required: - data - ListEvalTasksResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/EvalTask' - additionalProperties: false - required: - - data ListModelsResponse: type: object properties: @@ -4569,18 +4786,18 @@ components: additionalProperties: false required: - data - RegisterDatasetRequest: + RegisterBenchmarkRequest: type: object properties: + benchmark_id: + type: string dataset_id: type: string - dataset_schema: - type: object - additionalProperties: - $ref: '#/components/schemas/ParamType' - url: - $ref: '#/components/schemas/URL' - provider_dataset_id: + scoring_functions: + type: array + items: + type: string + provider_benchmark_id: type: string provider_id: type: string @@ -4596,21 +4813,21 @@ components: - type: object additionalProperties: false required: + - benchmark_id - dataset_id - - dataset_schema - - url - RegisterEvalTaskRequest: + - scoring_functions + RegisterDatasetRequest: type: object properties: - eval_task_id: - type: string dataset_id: type: string - 
scoring_functions: - type: array - items: - type: string - provider_eval_task_id: + dataset_schema: + type: object + additionalProperties: + $ref: '#/components/schemas/ParamType' + url: + $ref: '#/components/schemas/URL' + provider_dataset_id: type: string provider_id: type: string @@ -4626,9 +4843,9 @@ components: - type: object additionalProperties: false required: - - eval_task_id - dataset_id - - scoring_functions + - dataset_schema + - url RegisterModelRequest: type: object properties: @@ -4739,18 +4956,10 @@ components: type: object properties: task_config: - $ref: '#/components/schemas/EvalTaskConfig' + $ref: '#/components/schemas/BenchmarkConfig' additionalProperties: false required: - task_config - Job: - type: object - properties: - job_id: - type: string - additionalProperties: false - required: - - job_id RunShieldRequest: type: object properties: @@ -5049,10 +5258,10 @@ tags: x-displayName: >- Agents API for creating and interacting with agentic systems. - name: BatchInference (Coming Soon) + - name: Benchmarks - name: DatasetIO - name: Datasets - name: Eval - - name: EvalTasks - name: Inference description: >- This API provides the raw interface to the underlying models. 
Two kinds of models @@ -5083,10 +5292,10 @@ x-tagGroups: tags: - Agents - BatchInference (Coming Soon) + - Benchmarks - DatasetIO - Datasets - Eval - - EvalTasks - Inference - Inspect - Models diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index abe537c8e1..ee616b4716 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -324,7 +324,7 @@ "- vector_io\n", "container_image: null\n", "datasets: []\n", - "eval_tasks: []\n", + "benchmarks: []\n", "image_name: together\n", "metadata_store:\n", " db_path: /Users/ashwin/.llama/distributions/together/registry.db\n", @@ -508,7 +508,7 @@ "- vector_io\n", "container_image: null\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "image_name: together\n", "metadata_store:\n", " db_path: \u001b[35m/Users/ashwin/.llama/distributions/together/\u001b[0m\u001b[95mregistry.db\u001b[0m\n", diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 84da252469..8eecf84abb 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -370,7 +370,7 @@ "- tool_runtime\n", "datasets: []\n", "container_image: null\n", - "eval_tasks: []\n", + "benchmarks: []\n", "image_name: together\n", "memory_banks: []\n", "metadata_store:\n", @@ -551,7 +551,7 @@ "- tool_runtime\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "container_image: null\n", - "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "image_name: together\n", "memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "metadata_store:\n", diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index a0385cae00..0f3b997848 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ 
b/docs/openapi_generator/pyopenapi/generator.py @@ -647,6 +647,7 @@ def _build_operation(self, op: EndpointOperation) -> Operation: description = "\n".join( filter(None, [doc_string.short_description, doc_string.long_description]) ) + return Operation( tags=[op.defining_class.__name__], summary=None, @@ -656,6 +657,7 @@ def _build_operation(self, op: EndpointOperation) -> Operation: requestBody=requestBody, responses=responses, callbacks=callbacks, + deprecated=True if "DEPRECATED" in op.func_name else None, security=[] if op.public else None, ) diff --git a/docs/openapi_generator/pyopenapi/specification.py b/docs/openapi_generator/pyopenapi/specification.py index 4b54295c56..f96de58b69 100644 --- a/docs/openapi_generator/pyopenapi/specification.py +++ b/docs/openapi_generator/pyopenapi/specification.py @@ -117,6 +117,7 @@ class Operation: requestBody: Optional[RequestBody] = None callbacks: Optional[Dict[str, "Callback"]] = None security: Optional[List["SecurityRequirement"]] = None + deprecated: Optional[bool] = None @dataclass diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md index c4cb476e4f..f28e0d5fd7 100644 --- a/docs/source/building_applications/evals.md +++ b/docs/source/building_applications/evals.md @@ -41,14 +41,14 @@ system_message = { "content": SYSTEM_PROMPT_TEMPLATE, } -client.eval_tasks.register( - eval_task_id="meta-reference::mmmu", +client.benchmarks.register( + benchmark_id="meta-reference::mmmu", dataset_id=f"mmmu-{subset}-{split}", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) response = client.eval.evaluate_rows( - task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -99,14 +99,14 @@ eval_rows = client.datasetio.get_rows_paginated( ``` ```python -client.eval_tasks.register( - eval_task_id="meta-reference::simpleqa", +client.benchmarks.register( + 
benchmark_id="meta-reference::simpleqa", dataset_id=simpleqa_dataset_id, scoring_functions=["llm-as-judge::405b-simpleqa"], ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -156,7 +156,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ diff --git a/docs/source/building_applications/evaluation.md b/docs/source/building_applications/evaluation.md index 91e5c552bd..ad220f7518 100644 --- a/docs/source/building_applications/evaluation.md +++ b/docs/source/building_applications/evaluation.md @@ -10,15 +10,15 @@ Here's how to set up basic evaluation: ```python # Create an evaluation task -response = client.eval_tasks.register( - eval_task_id="my_eval", +response = client.benchmarks.register( + benchmark_id="my_eval", dataset_id="my_dataset", scoring_functions=["accuracy", "relevance"], ) # Run evaluation job = client.eval.run_eval( - task_id="my_eval", + benchmark_id="my_eval", task_config={ "type": "app", "eval_candidate": {"type": "agent", "config": agent_config}, @@ -26,5 +26,5 @@ job = client.eval.run_eval( ) # Get results -result = client.eval.job_result(task_id="my_eval", job_id=job.job_id) +result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id) ``` diff --git a/docs/source/concepts/evaluation_concepts.md b/docs/source/concepts/evaluation_concepts.md index 399d99d92d..3ca4b0ac8e 100644 --- a/docs/source/concepts/evaluation_concepts.md +++ b/docs/source/concepts/evaluation_concepts.md @@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications. 
- `/datasetio` + `/datasets` API - `/scoring` + `/scoring_functions` API -- `/eval` + `/eval_tasks` API +- `/eval` + `/benchmarks` API This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). @@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo - **Scoring**: evaluate outputs of the system. - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics. - **Eval**: generate outputs (via Inference or Agents) and perform scoring. - - Associated with `EvalTask` resource. + - Associated with `Benchmark` resource. Use the following decision tree to decide how to use LlamaStack Evaluation flow. diff --git a/docs/source/concepts/index.md b/docs/source/concepts/index.md index 1437ec6232..403e47c489 100644 --- a/docs/source/concepts/index.md +++ b/docs/source/concepts/index.md @@ -42,7 +42,7 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi - **Tool Runtime** is associated with `ToolGroup` resources. - **DatasetIO** is associated with `Dataset` resources. - **Scoring** is associated with `ScoringFunction` resources. -- **Eval** is associated with `Model` and `EvalTask` resources. +- **Eval** is associated with `Model` and `Benchmark` resources. Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack. 
diff --git a/docs/source/playground/index.md b/docs/source/playground/index.md index d74bf1a03b..9691609abf 100644 --- a/docs/source/playground/index.md +++ b/docs/source/playground/index.md @@ -64,7 +64,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie ``` ```bash - $ llama-stack-client eval_tasks register \ + $ llama-stack-client benchmarks register \ --eval-task-id meta-reference-mmlu \ --provider-id meta-reference \ --dataset-id mmlu \ @@ -86,7 +86,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie - Under the hood, it uses Llama Stack's `/providers` API to get information about the providers. - **API Resources**: Inspect Llama Stack API resources - - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `eval_tasks`, `shields`). + - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `benchmarks`, `shields`). - Under the hood, it uses Llama Stack's `//list` API to get information about each resources. - Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources. diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md index 86f66208af..71dbb47e59 100644 --- a/docs/source/references/evals_reference/index.md +++ b/docs/source/references/evals_reference/index.md @@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications. - `/datasetio` + `/datasets` API - `/scoring` + `/scoring_functions` API -- `/eval` + `/eval_tasks` API +- `/eval` + `/benchmarks` API This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. 
Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). @@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo - **Scoring**: evaluate outputs of the system. - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics. - **Eval**: generate outputs (via Inference or Agents) and perform scoring. - - Associated with `EvalTask` resource. + - Associated with `Benchmark` resource. Use the following decision tree to decide how to use LlamaStack Evaluation flow. @@ -77,14 +77,14 @@ system_message = { "content": SYSTEM_PROMPT_TEMPLATE, } -client.eval_tasks.register( - eval_task_id="meta-reference::mmmu", +client.benchmarks.register( + benchmark_id="meta-reference::mmmu", dataset_id=f"mmmu-{subset}-{split}", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) response = client.eval.evaluate_rows( - task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -135,14 +135,14 @@ eval_rows = client.datasetio.get_rows_paginated( ``` ```python -client.eval_tasks.register( - eval_task_id="meta-reference::simpleqa", +client.benchmarks.register( + benchmark_id="meta-reference::simpleqa", dataset_id=simpleqa_dataset_id, scoring_functions=["llm-as-judge::405b-simpleqa"], ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -192,7 +192,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + 
benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -281,7 +281,7 @@ The following examples give the quick steps to start running evaluations using t #### Benchmark Evaluation CLI Usage: There are 2 inputs necessary for running a benchmark eval -- `eval-task-id`: the identifier associated with the eval task. Each `EvalTask` is parametrized by +- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by - `dataset_id`: the identifier associated with the dataset. - `List[scoring_function_id]`: list of scoring function identifiers. - `eval-task-config`: specifies the configuration of the model / agent to evaluate on. @@ -289,7 +289,7 @@ Usage: There are 2 inputs necessary for running a benchmark eval ``` llama-stack-client eval run_benchmark \ ---eval-task-config ~/eval_task_config.json \ +--eval-task-config ~/benchmark_config.json \ --visualize ``` @@ -309,15 +309,15 @@ llama-stack-client eval run_scoring ... --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] +$ llama-stack-client benchmarks register --eval-task-id --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] ``` Options: @@ -191,7 +191,7 @@ Options: - `--num-examples`: Optional. Number of examples to evaluate (useful for debugging) - `--visualize`: Optional flag. 
If set, visualizes evaluation results after completion -Example eval_task_config.json: +Example benchmark_config.json: ```json { "type": "benchmark", diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md index 8a06e22442..9d1130422f 100644 --- a/docs/source/references/python_sdk_reference/index.md +++ b/docs/source/references/python_sdk_reference/index.md @@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job Methods: -- client.eval.evaluate_rows(task_id, \*\*params) -> EvaluateResponse -- client.eval.run_eval(task_id, \*\*params) -> Job +- client.eval.evaluate_rows(benchmark_id, \*\*params) -> EvaluateResponse +- client.eval.run_eval(benchmark_id, \*\*params) -> Job ### Jobs @@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse Methods: -- client.eval.jobs.retrieve(job_id, \*, task_id) -> EvaluateResponse -- client.eval.jobs.cancel(job_id, \*, task_id) -> None -- client.eval.jobs.status(job_id, \*, task_id) -> Optional[JobStatusResponse] +- client.eval.jobs.retrieve(job_id, \*, benchmark_id) -> EvaluateResponse +- client.eval.jobs.cancel(job_id, \*, benchmark_id) -> None +- client.eval.jobs.status(job_id, \*, benchmark_id) -> Optional[JobStatusResponse] ## Inspect @@ -443,20 +443,20 @@ Methods: - client.scoring_functions.list() -> ScoringFunctionListResponse - client.scoring_functions.register(\*\*params) -> None -## EvalTasks +## Benchmarks Types: ```python from llama_stack_client.types import ( - EvalTask, - ListEvalTasksResponse, - EvalTaskListResponse, + Benchmark, + ListBenchmarksResponse, + BenchmarkListResponse, ) ``` Methods: -- client.eval_tasks.retrieve(eval_task_id) -> Optional[EvalTask] -- client.eval_tasks.list() -> EvalTaskListResponse -- client.eval_tasks.register(\*\*params) -> None +- client.benchmarks.retrieve(benchmark_id) -> Optional[Benchmark] +- client.benchmarks.list() -> BenchmarkListResponse +- 
client.benchmarks.register(\*\*params) -> None diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/benchmarks/__init__.py similarity index 81% rename from llama_stack/apis/eval_tasks/__init__.py rename to llama_stack/apis/benchmarks/__init__.py index 7ca2167068..f8f5649570 100644 --- a/llama_stack/apis/eval_tasks/__init__.py +++ b/llama_stack/apis/benchmarks/__init__.py @@ -4,4 +4,4 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from .eval_tasks import * # noqa: F401 F403 +from .benchmarks import * # noqa: F401 F403 diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py new file mode 100644 index 0000000000..50019b18c7 --- /dev/null +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -0,0 +1,86 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
+
+from llama_models.schema_utils import json_schema_type, webmethod
+from pydantic import BaseModel, Field
+
+from llama_stack.apis.resource import Resource, ResourceType
+
+
+class CommonBenchmarkFields(BaseModel):
+    dataset_id: str
+    scoring_functions: List[str]
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Metadata for this evaluation task",
+    )
+
+
+@json_schema_type
+class Benchmark(CommonBenchmarkFields, Resource):
+    type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
+
+    @property
+    def benchmark_id(self) -> str:
+        return self.identifier
+
+    @property
+    def provider_benchmark_id(self) -> str:
+        return self.provider_resource_id
+
+
+class BenchmarkInput(CommonBenchmarkFields, BaseModel):
+    benchmark_id: str
+    provider_id: Optional[str] = None
+    provider_benchmark_id: Optional[str] = None
+
+
+class ListBenchmarksResponse(BaseModel):
+    data: List[Benchmark]
+
+
+@runtime_checkable
+class Benchmarks(Protocol):
+    @webmethod(route="/eval/benchmarks", method="GET")
+    async def list_benchmarks(self) -> ListBenchmarksResponse: ...
+
+    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
+    async def get_benchmark(
+        self,
+        benchmark_id: str,
+    ) -> Optional[Benchmark]: ...
+
+    @webmethod(route="/eval/benchmarks", method="POST")
+    async def register_benchmark(
+        self,
+        benchmark_id: str,
+        dataset_id: str,
+        scoring_functions: List[str],
+        provider_benchmark_id: Optional[str] = None,
+        provider_id: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None: ...
+
+    @webmethod(route="/eval-tasks", method="GET")
+    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ...
+
+    @webmethod(route="/eval-tasks/{eval_task_id}", method="GET")
+    async def DEPRECATED_get_eval_task(
+        self,
+        eval_task_id: str,
+    ) -> Optional[Benchmark]: ...
+ + @webmethod(route="/eval-tasks", method="POST") + async def DEPRECATED_register_eval_task( + self, + eval_task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_benchmark_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py index ccc395b80b..0751b2c9b2 100644 --- a/llama_stack/apis/datatypes.py +++ b/llama_stack/apis/datatypes.py @@ -28,7 +28,7 @@ class Api(Enum): vector_dbs = "vector_dbs" datasets = "datasets" scoring_functions = "scoring_functions" - eval_tasks = "eval_tasks" + benchmarks = "benchmarks" tool_groups = "tool_groups" # built-in API diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index ae13a5bd95..e5c7821503 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -38,19 +38,9 @@ class AgentCandidate(BaseModel): @json_schema_type -class BenchmarkEvalTaskConfig(BaseModel): +class BenchmarkConfig(BaseModel): type: Literal["benchmark"] = "benchmark" eval_candidate: EvalCandidate - num_examples: Optional[int] = Field( - description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated", - default=None, - ) - - -@json_schema_type -class AppEvalTaskConfig(BaseModel): - type: Literal["app"] = "app" - eval_candidate: EvalCandidate scoring_params: Dict[str, ScoringFnParams] = Field( description="Map between scoring function id and parameters for each scoring function you want to run", default_factory=dict, @@ -62,12 +52,6 @@ class AppEvalTaskConfig(BaseModel): # we could optinally add any specific dataset config here -EvalTaskConfig = register_schema( - Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")], - name="EvalTaskConfig", -) - - @json_schema_type class EvaluateResponse(BaseModel): generations: List[Dict[str, Any]] @@ -76,27 +60,52 @@ 
class EvaluateResponse(BaseModel): class Eval(Protocol): - @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST") async def run_eval( + self, + benchmark_id: str, + task_config: BenchmarkConfig, + ) -> Job: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") + async def evaluate_rows( + self, + benchmark_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE") + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + async def DEPRECATED_run_eval( self, task_id: str, - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> Job: ... @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") - async def evaluate_rows( + async def DEPRECATED_evaluate_rows( self, task_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") - async def job_cancel(self, task_id: str, job_id: str) -> None: ... 
+ async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ... + async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ... diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py deleted file mode 100644 index a0a5330553..0000000000 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. -from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable - -from llama_models.schema_utils import json_schema_type, webmethod -from pydantic import BaseModel, Field - -from llama_stack.apis.resource import Resource, ResourceType - - -class CommonEvalTaskFields(BaseModel): - dataset_id: str - scoring_functions: List[str] - metadata: Dict[str, Any] = Field( - default_factory=dict, - description="Metadata for this evaluation task", - ) - - -@json_schema_type -class EvalTask(CommonEvalTaskFields, Resource): - type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value - - @property - def eval_task_id(self) -> str: - return self.identifier - - @property - def provider_eval_task_id(self) -> str: - return self.provider_resource_id - - -class EvalTaskInput(CommonEvalTaskFields, BaseModel): - eval_task_id: str - provider_id: Optional[str] = None - provider_eval_task_id: Optional[str] = None - - -class ListEvalTasksResponse(BaseModel): - data: List[EvalTask] - - -@runtime_checkable -class EvalTasks(Protocol): - @webmethod(route="/eval-tasks", method="GET") - async def list_eval_tasks(self) -> ListEvalTasksResponse: ... 
- - @webmethod(route="/eval-tasks/{eval_task_id}", method="GET") - async def get_eval_task( - self, - eval_task_id: str, - ) -> Optional[EvalTask]: ... - - @webmethod(route="/eval-tasks", method="POST") - async def register_eval_task( - self, - eval_task_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_eval_task_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py index 145113a5d6..70ec63c55d 100644 --- a/llama_stack/apis/resource.py +++ b/llama_stack/apis/resource.py @@ -15,7 +15,7 @@ class ResourceType(Enum): vector_db = "vector_db" dataset = "dataset" scoring_function = "scoring_function" - eval_task = "eval_task" + benchmark = "benchmark" tool = "tool" tool_group = "tool_group" diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index 5622aaeac8..63ae1dc738 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -13,8 +13,8 @@ Literal, Optional, Protocol, - runtime_checkable, Union, + runtime_checkable, ) from llama_models.llama3.api.datatypes import Primitive diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 97706f22a5..f62996081b 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -8,10 +8,10 @@ from pydantic import BaseModel, Field +from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset, DatasetInput from llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput from llama_stack.apis.inference import Inference from llama_stack.apis.models import Model, ModelInput from llama_stack.apis.safety import Safety @@ -37,7 +37,7 @@ VectorDB, Dataset, ScoringFn, - EvalTask, + 
Benchmark, Tool, ToolGroup, ] @@ -50,7 +50,7 @@ VectorDB, Dataset, ScoringFn, - EvalTask, + Benchmark, Tool, ToolGroup, ], @@ -173,7 +173,7 @@ class StackRunConfig(BaseModel): vector_dbs: List[VectorDBInput] = Field(default_factory=list) datasets: List[DatasetInput] = Field(default_factory=list) scoring_fns: List[ScoringFnInput] = Field(default_factory=list) - eval_tasks: List[EvalTaskInput] = Field(default_factory=list) + benchmarks: List[BenchmarkInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) server: ServerConfig = Field( diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 2dcf38463b..384e2c3c89 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -44,7 +44,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: router_api=Api.scoring, ), AutoRoutedApiInfo( - routing_table_api=Api.eval_tasks, + routing_table_api=Api.benchmarks, router_api=Api.eval, ), AutoRoutedApiInfo( diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index 353c2971ba..0bc2e774c1 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -9,10 +9,10 @@ from typing import Any, Dict, List, Set from llama_stack.apis.agents import Agents +from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models @@ -37,8 +37,8 @@ from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.providers.datatypes import ( Api, + BenchmarksProtocolPrivate, DatasetsProtocolPrivate, - EvalTasksProtocolPrivate, InlineProviderSpec, 
ModelsProtocolPrivate, ProviderSpec, @@ -73,7 +73,7 @@ def api_protocol_map() -> Dict[Api, Any]: Api.scoring: Scoring, Api.scoring_functions: ScoringFunctions, Api.eval: Eval, - Api.eval_tasks: EvalTasks, + Api.benchmarks: Benchmarks, Api.post_training: PostTraining, Api.tool_groups: ToolGroups, Api.tool_runtime: ToolRuntime, @@ -92,7 +92,7 @@ def additional_protocols_map() -> Dict[Api, Any]: ScoringFunctions, Api.scoring_functions, ), - Api.eval: (EvalTasksProtocolPrivate, EvalTasks, Api.eval_tasks), + Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks), } diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py index 18197ca7f1..a54f57fb30 100644 --- a/llama_stack/distribution/routers/__init__.py +++ b/llama_stack/distribution/routers/__init__.py @@ -11,8 +11,8 @@ from llama_stack.providers.datatypes import Api, RoutingTable from .routing_tables import ( + BenchmarksRoutingTable, DatasetsRoutingTable, - EvalTasksRoutingTable, ModelsRoutingTable, ScoringFunctionsRoutingTable, ShieldsRoutingTable, @@ -33,7 +33,7 @@ async def get_routing_table_impl( "shields": ShieldsRoutingTable, "datasets": DatasetsRoutingTable, "scoring_functions": ScoringFunctionsRoutingTable, - "eval_tasks": EvalTasksRoutingTable, + "benchmarks": BenchmarksRoutingTable, "tool_groups": ToolGroupsRoutingTable, } diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index e716e44b08..f45975189f 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -9,9 +9,8 @@ from llama_stack.apis.common.content_types import URL, InterleavedContent from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult from llama_stack.apis.eval import ( - AppEvalTaskConfig, + BenchmarkConfig, Eval, - EvalTaskConfig, EvaluateResponse, Job, JobStatus, @@ -347,23 +346,23 @@ async def shutdown(self) -> None: async def run_eval( self, - task_id: str, - 
task_config: AppEvalTaskConfig, + benchmark_id: str, + task_config: BenchmarkConfig, ) -> Job: - return await self.routing_table.get_provider_impl(task_id).run_eval( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).run_eval( + benchmark_id=benchmark_id, task_config=task_config, ) async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).evaluate_rows( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows( + benchmark_id=benchmark_id, input_rows=input_rows, scoring_functions=scoring_functions, task_config=task_config, @@ -371,31 +370,73 @@ async def evaluate_rows( async def job_status( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> Optional[JobStatus]: - return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id) + return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id) async def job_cancel( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> None: - await self.routing_table.get_provider_impl(task_id).job_cancel( - task_id, + await self.routing_table.get_provider_impl(benchmark_id).job_cancel( + benchmark_id, job_id, ) async def job_result( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).job_result( - task_id, + return await self.routing_table.get_provider_impl(benchmark_id).job_result( + benchmark_id, job_id, ) + async def DEPRECATED_run_eval( + self, + task_id: str, + task_config: BenchmarkConfig, + ) -> Job: + return await self.run_eval(benchmark_id=task_id, task_config=task_config) + + async def DEPRECATED_evaluate_rows( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + 
scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: + return await self.evaluate_rows( + benchmark_id=task_id, + input_rows=input_rows, + scoring_functions=scoring_functions, + task_config=task_config, + ) + + async def DEPRECATED_job_status( + self, + task_id: str, + job_id: str, + ) -> Optional[JobStatus]: + return await self.job_status(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_cancel( + self, + task_id: str, + job_id: str, + ) -> None: + return await self.job_cancel(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_result( + self, + task_id: str, + job_id: str, + ) -> EvaluateResponse: + return await self.job_result(benchmark_id=task_id, job_id=job_id) + class ToolRuntimeRouter(ToolRuntime): class RagToolImpl(RAGToolRuntime): diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 009775ca52..2cddc3970d 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -4,14 +4,15 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import logging from typing import Any, Dict, List, Optional from pydantic import TypeAdapter +from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ParamType from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse -from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType from llama_stack.apis.resource import ResourceType from llama_stack.apis.scoring_functions import ( @@ -38,6 +39,8 @@ from llama_stack.distribution.store import DistributionRegistry from llama_stack.providers.datatypes import Api, RoutingTable +logger = logging.getLogger(__name__) + def get_impl_api(p: Any) -> Api: return p.__provider_spec__.api @@ -60,7 +63,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable elif api == Api.scoring: return await p.register_scoring_function(obj) elif api == Api.eval: - return await p.register_eval_task(obj) + return await p.register_benchmark(obj) elif api == Api.tool_runtime: return await p.register_tool(obj) else: @@ -121,7 +124,7 @@ async def add_objects(objs: List[RoutableObjectWithProvider], provider_id: str, scoring_functions = await p.list_scoring_functions() await add_objects(scoring_functions, pid, ScoringFn) elif api == Api.eval: - p.eval_task_store = self + p.benchmark_store = self elif api == Api.tool_runtime: p.tool_store = self @@ -141,8 +144,8 @@ def apiname_object(): return ("DatasetIO", "dataset") elif isinstance(self, ScoringFunctionsRoutingTable): return ("Scoring", "scoring_function") - elif isinstance(self, EvalTasksRoutingTable): - return ("Eval", "eval_task") + elif isinstance(self, BenchmarksRoutingTable): + return ("Eval", "benchmark") elif isinstance(self, ToolGroupsRoutingTable): return ("Tools", "tool") else: @@ -428,20 +431,20 @@ async def 
register_scoring_function( await self.register_object(scoring_fn) -class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks): - async def list_eval_tasks(self) -> ListEvalTasksResponse: - return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task")) +class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): + async def list_benchmarks(self) -> ListBenchmarksResponse: + return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark")) - async def get_eval_task(self, eval_task_id: str) -> Optional[EvalTask]: - return await self.get_object_by_identifier("eval_task", eval_task_id) + async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]: + return await self.get_object_by_identifier("benchmark", benchmark_id) - async def register_eval_task( + async def register_benchmark( self, - eval_task_id: str, + benchmark_id: str, dataset_id: str, scoring_functions: List[str], metadata: Optional[Dict[str, Any]] = None, - provider_eval_task_id: Optional[str] = None, + provider_benchmark_id: Optional[str] = None, provider_id: Optional[str] = None, ) -> None: if metadata is None: @@ -453,17 +456,46 @@ async def register_eval_task( raise ValueError( "No provider specified and multiple providers available. Please specify a provider_id." 
) - if provider_eval_task_id is None: - provider_eval_task_id = eval_task_id - eval_task = EvalTask( - identifier=eval_task_id, + if provider_benchmark_id is None: + provider_benchmark_id = benchmark_id + benchmark = Benchmark( + identifier=benchmark_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, provider_id=provider_id, - provider_resource_id=provider_eval_task_id, + provider_resource_id=provider_benchmark_id, + ) + await self.register_object(benchmark) + + async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: + logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.list_benchmarks() + + async def DEPRECATED_get_eval_task( + self, + eval_task_id: str, + ) -> Optional[Benchmark]: + logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.get_benchmark(eval_task_id) + + async def DEPRECATED_register_eval_task( + self, + eval_task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_benchmark_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: + logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.register_benchmark( + benchmark_id=eval_task_id, + dataset_id=dataset_id, + scoring_functions=scoring_functions, + metadata=metadata, + provider_benchmark_id=provider_benchmark_id, ) - await self.register_object(eval_task) class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index 2baad8ac45..9335dc3a95 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -15,10 +15,10 @@ from llama_stack.apis.agents import Agents from llama_stack.apis.batch_inference import BatchInference +from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from 
llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models @@ -53,7 +53,7 @@ class LlamaStack( PostTraining, VectorIO, Eval, - EvalTasks, + Benchmarks, Scoring, ScoringFunctions, DatasetIO, @@ -78,7 +78,7 @@ class LlamaStack( "register_scoring_function", "list_scoring_functions", ), - ("eval_tasks", Api.eval_tasks, "register_eval_task", "list_eval_tasks"), + ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"), ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"), ] diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md index c0a2597af5..8fceb5c63c 100644 --- a/llama_stack/distribution/ui/README.md +++ b/llama_stack/distribution/ui/README.md @@ -26,7 +26,7 @@ $ llama-stack-client datasets register \ ``` ```bash -$ llama-stack-client eval_tasks register \ +$ llama-stack-client benchmarks register \ --eval-task-id meta-reference-mmlu \ --provider-id meta-reference \ --dataset-id mmlu \ diff --git a/llama_stack/distribution/ui/page/distribution/eval_tasks.py b/llama_stack/distribution/ui/page/distribution/eval_tasks.py index f589696631..1428ae9ab2 100644 --- a/llama_stack/distribution/ui/page/distribution/eval_tasks.py +++ b/llama_stack/distribution/ui/page/distribution/eval_tasks.py @@ -8,12 +8,12 @@ from modules.api import llama_stack_api -def eval_tasks(): - # Eval Tasks Section - st.header("Eval Tasks") +def benchmarks(): + # Benchmarks Section + st.header("Benchmarks") - eval_tasks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list()} + benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()} - if len(eval_tasks_info) > 0: - selected_eval_task = st.selectbox("Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect") - 
st.json(eval_tasks_info[selected_eval_task], expanded=True) + if len(benchmarks_info) > 0: + selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect") + st.json(benchmarks_info[selected_benchmark], expanded=True) diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py index 94b840bcb8..684270d4de 100644 --- a/llama_stack/distribution/ui/page/distribution/resources.py +++ b/llama_stack/distribution/ui/page/distribution/resources.py @@ -4,8 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from page.distribution.benchmarks import benchmarks from page.distribution.datasets import datasets -from page.distribution.eval_tasks import eval_tasks from page.distribution.models import models from page.distribution.scoring_functions import scoring_functions from page.distribution.shields import shields @@ -20,7 +20,7 @@ def resources_page(): "Shields", "Scoring Functions", "Datasets", - "Eval Tasks", + "Benchmarks", ] icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"] selected_resource = option_menu( @@ -34,8 +34,8 @@ def resources_page(): }, }, ) - if selected_resource == "Eval Tasks": - eval_tasks() + if selected_resource == "Benchmarks": + benchmarks() elif selected_resource == "Vector Databases": vector_dbs() elif selected_resource == "Datasets": diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index 112d9cff02..f1cae714a9 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -11,28 +11,28 @@ from modules.api import llama_stack_api -def select_eval_task_1(): - # Select Eval Tasks +def select_benchmark_1(): + # Select Benchmarks st.subheader("1. 
Choose An Eval Task") - eval_tasks = llama_stack_api.client.eval_tasks.list() - eval_tasks = {et.identifier: et for et in eval_tasks} - eval_tasks_names = list(eval_tasks.keys()) - selected_eval_task = st.selectbox( + benchmarks = llama_stack_api.client.benchmarks.list() + benchmarks = {et.identifier: et for et in benchmarks} + benchmarks_names = list(benchmarks.keys()) + selected_benchmark = st.selectbox( "Choose an eval task.", - options=eval_tasks_names, + options=benchmarks_names, help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.", ) with st.expander("View Eval Task"): - st.json(eval_tasks[selected_eval_task], expanded=True) + st.json(benchmarks[selected_benchmark], expanded=True) - st.session_state["selected_eval_task"] = selected_eval_task - st.session_state["eval_tasks"] = eval_tasks + st.session_state["selected_benchmark"] = selected_benchmark + st.session_state["benchmarks"] = benchmarks if st.button("Confirm", key="confirm_1"): - st.session_state["selected_eval_task_1_next"] = True + st.session_state["selected_benchmark_1_next"] = True def define_eval_candidate_2(): - if not st.session_state.get("selected_eval_task_1_next", None): + if not st.session_state.get("selected_benchmark_1_next", None): return st.subheader("2. Define Eval Candidate") @@ -161,11 +161,11 @@ def run_evaluation_3(): Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button. 
""" ) - selected_eval_task = st.session_state["selected_eval_task"] - eval_tasks = st.session_state["eval_tasks"] + selected_benchmark = st.session_state["selected_benchmark"] + benchmarks = st.session_state["benchmarks"] eval_candidate = st.session_state["eval_candidate"] - dataset_id = eval_tasks[selected_eval_task].dataset_id + dataset_id = benchmarks[selected_benchmark].dataset_id rows = llama_stack_api.client.datasetio.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1, @@ -180,16 +180,16 @@ def run_evaluation_3(): help="Number of examples from the dataset to evaluate. ", ) - eval_task_config = { + benchmark_config = { "type": "benchmark", "eval_candidate": eval_candidate, "scoring_params": {}, } with st.expander("View Evaluation Task", expanded=True): - st.json(eval_tasks[selected_eval_task], expanded=True) + st.json(benchmarks[selected_benchmark], expanded=True) with st.expander("View Evaluation Task Configuration", expanded=True): - st.json(eval_task_config, expanded=True) + st.json(benchmark_config, expanded=True) # Add run button and handle evaluation if st.button("Run Evaluation"): @@ -209,10 +209,10 @@ def run_evaluation_3(): progress_bar.progress(progress, text=progress_text) # Run evaluation for current row eval_res = llama_stack_api.client.eval.evaluate_rows( - task_id=selected_eval_task, + benchmark_id=selected_benchmark, input_rows=[r], - scoring_functions=eval_tasks[selected_eval_task].scoring_functions, - task_config=eval_task_config, + scoring_functions=benchmarks[selected_benchmark].scoring_functions, + task_config=benchmark_config, ) for k in r.keys(): @@ -225,7 +225,7 @@ def run_evaluation_3(): output_res[k] = [] output_res[k].append(eval_res.generations[0][k]) - for scoring_fn in eval_tasks[selected_eval_task].scoring_functions: + for scoring_fn in benchmarks[selected_benchmark].scoring_functions: if scoring_fn not in output_res: output_res[scoring_fn] = [] output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) @@ 
-245,7 +245,7 @@ def native_evaluation_page(): st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙") st.title("📊 Evaluations (Generation + Scoring)") - select_eval_task_1() + select_benchmark_1() define_eval_candidate_2() run_evaluation_3() diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index ccdaf76e74..b92f9dc0a0 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -10,9 +10,9 @@ from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field +from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasets import Dataset from llama_stack.apis.datatypes import Api -from llama_stack.apis.eval_tasks import EvalTask from llama_stack.apis.models import Model from llama_stack.apis.scoring_functions import ScoringFn from llama_stack.apis.shields import Shield @@ -48,8 +48,8 @@ async def list_scoring_functions(self) -> List[ScoringFn]: ... async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ... -class EvalTasksProtocolPrivate(Protocol): - async def register_eval_task(self, eval_task: EvalTask) -> None: ... +class BenchmarksProtocolPrivate(Protocol): + async def register_benchmark(self, benchmark: Benchmark) -> None: ... 
class ToolsProtocolPrivate(Protocol): diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 1c44caf7f0..cd99c9ad89 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -8,13 +8,13 @@ from tqdm import tqdm from llama_stack.apis.agents import Agents, StepType +from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets -from llama_stack.apis.eval_tasks import EvalTask from llama_stack.apis.inference import Inference, UserMessage from llama_stack.apis.scoring import Scoring from llama_stack.distribution.datatypes import Api -from llama_stack.providers.datatypes import EvalTasksProtocolPrivate +from llama_stack.providers.datatypes import BenchmarksProtocolPrivate from llama_stack.providers.inline.agents.meta_reference.agent_instance import ( MEMORY_QUERY_TOOL, ) @@ -26,15 +26,15 @@ from llama_stack.providers.utils.kvstore import kvstore_impl from .....apis.common.job_types import Job -from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus +from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus from .config import MetaReferenceEvalConfig -EVAL_TASKS_PREFIX = "eval_tasks:" +EVAL_TASKS_PREFIX = "benchmarks:" class MetaReferenceEvalImpl( Eval, - EvalTasksProtocolPrivate, + BenchmarksProtocolPrivate, ): def __init__( self, @@ -55,36 +55,36 @@ def __init__( # TODO: assume sync job, will need jobs API for async scheduling self.jobs = {} - self.eval_tasks = {} + self.benchmarks = {} async def initialize(self) -> None: self.kvstore = await kvstore_impl(self.config.kvstore) - # Load existing eval_tasks from kvstore + # Load existing benchmarks from kvstore start_key = EVAL_TASKS_PREFIX end_key = f"{EVAL_TASKS_PREFIX}\xff" - stored_eval_tasks = await self.kvstore.range(start_key, 
end_key) + stored_benchmarks = await self.kvstore.range(start_key, end_key) - for eval_task in stored_eval_tasks: - eval_task = EvalTask.model_validate_json(eval_task) - self.eval_tasks[eval_task.identifier] = eval_task + for benchmark in stored_benchmarks: + benchmark = Benchmark.model_validate_json(benchmark) + self.benchmarks[benchmark.identifier] = benchmark async def shutdown(self) -> None: ... - async def register_eval_task(self, task_def: EvalTask) -> None: + async def register_benchmark(self, task_def: Benchmark) -> None: # Store in kvstore key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}" await self.kvstore.set( key=key, value=task_def.model_dump_json(), ) - self.eval_tasks[task_def.identifier] = task_def + self.benchmarks[task_def.identifier] = task_def async def run_eval( self, - task_id: str, - task_config: EvalTaskConfig, + benchmark_id: str, + task_config: BenchmarkConfig, ) -> Job: - task_def = self.eval_tasks[task_id] + task_def = self.benchmarks[benchmark_id] dataset_id = task_def.dataset_id candidate = task_config.eval_candidate scoring_functions = task_def.scoring_functions @@ -95,7 +95,7 @@ async def run_eval( rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples), ) res = await self.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=all_rows.rows, scoring_functions=scoring_functions, task_config=task_config, @@ -108,7 +108,7 @@ async def run_eval( return Job(job_id=job_id) async def _run_agent_generation( - self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig + self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig ) -> List[Dict[str, Any]]: candidate = task_config.eval_candidate create_response = await self.agents_api.create_agent(candidate.config) @@ -151,7 +151,7 @@ async def _run_agent_generation( return generations async def _run_model_generation( - self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig + self, input_rows: List[Dict[str, Any]], 
task_config: BenchmarkConfig ) -> List[Dict[str, Any]]: candidate = task_config.eval_candidate assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided" @@ -187,10 +187,10 @@ async def _run_model_generation( async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: candidate = task_config.eval_candidate if candidate.type == "agent": @@ -203,7 +203,7 @@ async def evaluate_rows( # scoring with generated_answer score_input_rows = [input_r | generated_r for input_r, generated_r in zip(input_rows, generations)] - if task_config.type == "app" and task_config.scoring_params is not None: + if task_config.scoring_params is not None: scoring_functions_dict = { scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None) for scoring_fn_id in scoring_functions @@ -217,18 +217,60 @@ async def evaluate_rows( return EvaluateResponse(generations=generations, scores=score_response.results) - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: if job_id in self.jobs: return JobStatus.completed return None - async def job_cancel(self, task_id: str, job_id: str) -> None: + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: raise NotImplementedError("Job cancel is not implemented yet") - async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse: - status = await self.job_status(task_id, job_id) + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: + status = await self.job_status(benchmark_id, job_id) if not status or status != JobStatus.completed: raise ValueError(f"Job is not completed, Status: {status.value}") return self.jobs[job_id] + + async def DEPRECATED_run_eval( + self, + task_id: str, + task_config: 
BenchmarkConfig, + ) -> Job: + return await self.run_eval(benchmark_id=task_id, task_config=task_config) + + async def DEPRECATED_evaluate_rows( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: + return await self.evaluate_rows( + benchmark_id=task_id, + input_rows=input_rows, + scoring_functions=scoring_functions, + task_config=task_config, + ) + + async def DEPRECATED_job_status( + self, + task_id: str, + job_id: str, + ) -> Optional[JobStatus]: + return await self.job_status(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_cancel( + self, + task_id: str, + job_id: str, + ) -> None: + return await self.job_cancel(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_result( + self, + task_id: str, + job_id: str, + ) -> EvaluateResponse: + return await self.job_result(benchmark_id=task_id, job_id=job_id) diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py index ec3d08728b..ad80b8601c 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -10,8 +10,8 @@ from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType from llama_stack.apis.eval.eval import ( - AppEvalTaskConfig, - BenchmarkEvalTaskConfig, + AppBenchmarkConfig, + BenchmarkBenchmarkConfig, ModelCandidate, ) from llama_stack.apis.inference import SamplingParams @@ -30,18 +30,18 @@ class Testeval: @pytest.mark.asyncio - async def test_eval_tasks_list(self, eval_stack): + async def test_benchmarks_list(self, eval_stack): # NOTE: this needs you to ensure that you are starting from a clean state # but so far we don't have an unregister API unfortunately, so be careful - eval_tasks_impl = eval_stack[Api.eval_tasks] - response = await eval_tasks_impl.list_eval_tasks() + benchmarks_impl = 
eval_stack[Api.benchmarks] + response = await benchmarks_impl.list_benchmarks() assert isinstance(response, list) @pytest.mark.asyncio async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model): - eval_impl, eval_tasks_impl, datasetio_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasetio], eval_stack[Api.datasets], eval_stack[Api.models], @@ -59,17 +59,17 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model scoring_functions = [ "basic::equality", ] - task_id = "meta-reference::app_eval" - await eval_tasks_impl.register_eval_task( - eval_task_id=task_id, + benchmark_id = "meta-reference::app_eval" + await benchmarks_impl.register_benchmark( + benchmark_id=benchmark_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=rows.rows, scoring_functions=scoring_functions, - task_config=AppEvalTaskConfig( + task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -92,9 +92,9 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model @pytest.mark.asyncio async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): - eval_impl, eval_tasks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasets], eval_stack[Api.models], ) @@ -105,15 +105,15 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): "basic::subset_of", ] - task_id = "meta-reference::app_eval-2" - await eval_tasks_impl.register_eval_task( - eval_task_id=task_id, + benchmark_id = 
"meta-reference::app_eval-2" + await benchmarks_impl.register_benchmark( + benchmark_id=benchmark_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.run_eval( - task_id=task_id, - task_config=AppEvalTaskConfig( + benchmark_id=benchmark_id, + task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -121,9 +121,9 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): ), ) assert response.job_id == "0" - job_status = await eval_impl.job_status(task_id, response.job_id) + job_status = await eval_impl.job_status(benchmark_id, response.job_id) assert job_status and job_status.value == "completed" - eval_response = await eval_impl.job_result(task_id, response.job_id) + eval_response = await eval_impl.job_result(benchmark_id, response.job_id) assert eval_response is not None assert len(eval_response.generations) == 5 @@ -131,9 +131,9 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): @pytest.mark.asyncio async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): - eval_impl, eval_tasks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasets], eval_stack[Api.models], ) @@ -159,20 +159,20 @@ async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): ) # register eval task - await eval_tasks_impl.register_eval_task( - eval_task_id="meta-reference-mmlu", + await benchmarks_impl.register_benchmark( + benchmark_id="meta-reference-mmlu", dataset_id="mmlu", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) # list benchmarks - response = await eval_tasks_impl.list_eval_tasks() + response = await benchmarks_impl.list_benchmarks() assert len(response) > 0 benchmark_id = "meta-reference-mmlu" response = await 
eval_impl.run_eval( - task_id=benchmark_id, - task_config=BenchmarkEvalTaskConfig( + benchmark_id=benchmark_id, + task_config=BenchmarkBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), diff --git a/llama_stack/providers/tests/resolver.py b/llama_stack/providers/tests/resolver.py index 0ff6327170..76343b7f48 100644 --- a/llama_stack/providers/tests/resolver.py +++ b/llama_stack/providers/tests/resolver.py @@ -10,8 +10,8 @@ from pydantic import BaseModel +from llama_stack.apis.benchmarks import BenchmarkInput from llama_stack.apis.datasets import DatasetInput -from llama_stack.apis.eval_tasks import EvalTaskInput from llama_stack.apis.models import ModelInput from llama_stack.apis.scoring_functions import ScoringFnInput from llama_stack.apis.shields import ShieldInput @@ -42,7 +42,7 @@ async def construct_stack_for_test( vector_dbs: Optional[List[VectorDBInput]] = None, datasets: Optional[List[DatasetInput]] = None, scoring_fns: Optional[List[ScoringFnInput]] = None, - eval_tasks: Optional[List[EvalTaskInput]] = None, + benchmarks: Optional[List[BenchmarkInput]] = None, tool_groups: Optional[List[ToolGroupInput]] = None, ) -> TestStack: sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db") @@ -56,7 +56,7 @@ async def construct_stack_for_test( vector_dbs=vector_dbs or [], datasets=datasets or [], scoring_fns=scoring_fns or [], - eval_tasks=eval_tasks or [], + benchmarks=benchmarks or [], tool_groups=tool_groups or [], ) run_config = parse_and_maybe_upgrade_config(run_config) diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index be6c9a928c..7d03b7c29a 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git 
a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 05d3f45259..6afff2be21 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index 04c5957d46..ddec3a7153 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -108,7 +108,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: brave-search diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 706444eb1b..9394c94efe 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -99,7 +99,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: brave-search diff --git a/llama_stack/templates/experimental-post-training/run.yaml b/llama_stack/templates/experimental-post-training/run.yaml index 75d103c9fa..e70ccdd2de 100644 --- a/llama_stack/templates/experimental-post-training/run.yaml +++ b/llama_stack/templates/experimental-post-training/run.yaml @@ -85,4 +85,4 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index 0fbe14a5a5..8f95e9d59b 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -164,7 +164,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] 
tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index ccf67dcbb0..64229a5d86 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -153,7 +153,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index f520a2fdab..867d7a0768 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -116,7 +116,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index 708cb1bcc6..d60acdefd9 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -106,7 +106,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 7f0abf5be0..e58ad15b34 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -116,7 +116,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index c0b7a4c605..5045e821af 100644 --- 
a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -106,7 +106,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index c5286fc6be..caac65c8c3 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 310585f23f..bade9a076f 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index d43cf3917e..f131e8ea65 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index c8ae362f54..14fb283544 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -139,7 +139,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] 
-eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index ac5dab7552..9d5bfc7a0e 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -113,7 +113,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 3a60fe61f1..9ac1f3267e 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -110,7 +110,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 1fe998a1f9..dd43f21f67 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 9d3db8a31e..24cd207c7f 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index 39b0f3c4e8..26815dcd06 100644 --- 
a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index ed6c9ef6f2..e1d85f59ac 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -106,7 +106,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index 8bf76f37b1..fc73e09789 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -105,7 +105,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index 2989266307..f101a5d600 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -159,7 +159,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 920003759c..8af85979d7 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -148,7 +148,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/vllm-gpu/run.yaml 
b/llama_stack/templates/vllm-gpu/run.yaml index 41a545e1ae..cdce5510d1 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search