From f1844a88c41a08d085c7b5165b714096ad0f5086 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 09:37:21 -0800 Subject: [PATCH 01/31] update eval-tasks -> eval/task --- docs/_static/llama-stack-spec.html | 52 ++++++++++++----------- docs/_static/llama-stack-spec.yaml | 35 +++++++-------- llama_stack/apis/eval_tasks/eval_tasks.py | 6 +-- 3 files changed, 48 insertions(+), 45 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 151ac14516..84e20f3602 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -728,7 +728,7 @@ ] } }, - "/v1/eval-tasks/{eval_task_id}": { + "/v1/eval/tasks/{task_id}": { "get": { "responses": { "200": { @@ -756,7 +756,7 @@ "parameters": [ { "name": "eval_task_id", - "in": "path", + "in": "query", "required": true, "schema": { "type": "string" @@ -1503,7 +1503,7 @@ } } }, - "/v1/eval-tasks": { + "/v1/eval/tasks/": { "get": { "responses": { "200": { @@ -1522,28 +1522,6 @@ ], "description": "", "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "EvalTasks" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterEvalTaskRequest" - } - } - }, - "required": true - } } }, "/v1/models": { @@ -2121,6 +2099,30 @@ ] } }, + "/v1/eval/tasks": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "EvalTasks" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterEvalTaskRequest" + } + } + }, + "required": true + } + } + }, "/v1/eval/tasks/{task_id}/jobs": { "post": { "responses": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 37fba45412..700a3071c8 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -440,7 +440,7 @@ paths: required: true schema: type: string - /v1/eval-tasks/{eval_task_id}: + /v1/eval/tasks/{task_id}: get: responses: '200': @@ -456,7 +456,7 @@ paths: description: '' parameters: - name: eval_task_id - in: path + in: query required: true schema: type: string @@ -895,7 +895,7 @@ paths: schema: $ref: '#/components/schemas/RegisterDatasetRequest' required: true - /v1/eval-tasks: + /v1/eval/tasks/: get: responses: '200': @@ -908,20 +908,6 @@ paths: - EvalTasks description: '' parameters: [] - post: - responses: - '200': - description: OK - tags: - - EvalTasks - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterEvalTaskRequest' - required: true /v1/models: get: responses: @@ -1278,6 +1264,21 @@ paths: type: array items: type: string + /v1/eval/tasks: + post: + responses: + '200': + description: OK + tags: + - EvalTasks + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterEvalTaskRequest' + required: true /v1/eval/tasks/{task_id}/jobs: post: responses: diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index a0a5330553..3600589d13 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -45,16 +45,16 @@ class ListEvalTasksResponse(BaseModel): @runtime_checkable class EvalTasks(Protocol): - @webmethod(route="/eval-tasks", method="GET") + 
@webmethod(route="/eval/tasks/", method="GET") async def list_eval_tasks(self) -> ListEvalTasksResponse: ... - @webmethod(route="/eval-tasks/{eval_task_id}", method="GET") + @webmethod(route="/eval/tasks/{task_id}", method="GET") async def get_eval_task( self, eval_task_id: str, ) -> Optional[EvalTask]: ... - @webmethod(route="/eval-tasks", method="POST") + @webmethod(route="/eval/tasks", method="POST") async def register_eval_task( self, eval_task_id: str, From 5fe3ddb27d706f2141959f15ccc40cac975cf578 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 09:41:01 -0800 Subject: [PATCH 02/31] update eval_task_id -> task_id --- llama_stack/apis/eval_tasks/eval_tasks.py | 4 ++-- llama_stack/distribution/routers/routing_tables.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index 3600589d13..f36d44c887 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -51,13 +51,13 @@ async def list_eval_tasks(self) -> ListEvalTasksResponse: ... @webmethod(route="/eval/tasks/{task_id}", method="GET") async def get_eval_task( self, - eval_task_id: str, + task_id: str, ) -> Optional[EvalTask]: ... @webmethod(route="/eval/tasks", method="POST") async def register_eval_task( self, - eval_task_id: str, + task_id: str, dataset_id: str, scoring_functions: List[str], provider_eval_task_id: Optional[str] = None, diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 68fafd8ee9..3f5dea66d1 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -432,8 +432,8 @@ class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks): async def list_eval_tasks(self) -> ListEvalTasksResponse: return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task")) - async def get_eval_task(self, eval_task_id: str) -> Optional[EvalTask]: - return await self.get_object_by_identifier("eval_task", eval_task_id) + async def get_eval_task(self, task_id: str) -> Optional[EvalTask]: + return await self.get_object_by_identifier("eval_task", task_id) async def register_eval_task( self, From b11c38ea552c0e5cc0ebefda47e6c775f4556372 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 09:41:21 -0800 Subject: [PATCH 03/31] openapi --- docs/_static/llama-stack-spec.html | 8 ++++---- docs/_static/llama-stack-spec.yaml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 84e20f3602..3106bff86b 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -755,8 +755,8 @@ "description": "", "parameters": [ { - "name": "eval_task_id", - "in": "query", + "name": "task_id", + "in": "path", "required": true, "schema": { "type": "string" @@ -7226,7 +7226,7 @@ "RegisterEvalTaskRequest": { "type": "object", "properties": { - "eval_task_id": { + "task_id": { "type": "string" }, "dataset_id": { @@ -7272,7 +7272,7 @@ }, "additionalProperties": false, "required": [ - "eval_task_id", + "task_id", "dataset_id", "scoring_functions" ] diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 700a3071c8..9b4220018d 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -455,8 +455,8 @@ paths: - EvalTasks description: '' parameters: - - name: 
eval_task_id - in: query + - name: task_id + in: path required: true schema: type: string @@ -4599,7 +4599,7 @@ components: RegisterEvalTaskRequest: type: object properties: - eval_task_id: + task_id: type: string dataset_id: type: string @@ -4623,7 +4623,7 @@ components: - type: object additionalProperties: false required: - - eval_task_id + - task_id - dataset_id - scoring_functions RegisterModelRequest: From e013b9066c2efaefee8c491d441b466a8b379777 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 10:47:28 -0800 Subject: [PATCH 04/31] fix path --- docs/_static/llama-stack-spec.html | 48 +++++++++---------- docs/_static/llama-stack-spec.yaml | 31 ++++++------ llama_stack/apis/eval_tasks/eval_tasks.py | 6 +-- .../distribution/routers/routing_tables.py | 6 +-- 4 files changed, 44 insertions(+), 47 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 3106bff86b..6d3e5b93b4 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -1503,7 +1503,7 @@ } } }, - "/v1/eval/tasks/": { + "/v1/eval/tasks": { "get": { "responses": { "200": { @@ -1522,6 +1522,28 @@ ], "description": "", "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "EvalTasks" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterEvalTaskRequest" + } + } + }, + "required": true + } } }, "/v1/models": { @@ -2099,30 +2121,6 @@ ] } }, - "/v1/eval/tasks": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "EvalTasks" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterEvalTaskRequest" - } - } - }, - "required": true - } - } - }, "/v1/eval/tasks/{task_id}/jobs": { "post": { "responses": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 9b4220018d..aa25c88f8b 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -895,7 +895,7 @@ paths: schema: $ref: '#/components/schemas/RegisterDatasetRequest' required: true - /v1/eval/tasks/: + /v1/eval/tasks: get: responses: '200': @@ -908,6 +908,20 @@ paths: - EvalTasks description: '' parameters: [] + post: + responses: + '200': + description: OK + tags: + - EvalTasks + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterEvalTaskRequest' + required: true /v1/models: get: responses: @@ -1264,21 +1278,6 @@ paths: type: array items: type: string - /v1/eval/tasks: - post: - responses: - '200': - description: OK - tags: - - EvalTasks - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterEvalTaskRequest' - required: true /v1/eval/tasks/{task_id}/jobs: post: responses: diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index f36d44c887..0a1a27885d 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -25,7 +25,7 @@ class EvalTask(CommonEvalTaskFields, Resource): type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value @property - def eval_task_id(self) -> str: + def task_id(self) -> str: return self.identifier @property @@ -34,7 +34,7 @@ def provider_eval_task_id(self) -> 
str: class EvalTaskInput(CommonEvalTaskFields, BaseModel): - eval_task_id: str + task_id: str provider_id: Optional[str] = None provider_eval_task_id: Optional[str] = None @@ -45,7 +45,7 @@ class ListEvalTasksResponse(BaseModel): @runtime_checkable class EvalTasks(Protocol): - @webmethod(route="/eval/tasks/", method="GET") + @webmethod(route="/eval/tasks", method="GET") async def list_eval_tasks(self) -> ListEvalTasksResponse: ... @webmethod(route="/eval/tasks/{task_id}", method="GET") diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 3f5dea66d1..0664e310a5 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -437,7 +437,7 @@ async def get_eval_task(self, task_id: str) -> Optional[EvalTask]: async def register_eval_task( self, - eval_task_id: str, + task_id: str, dataset_id: str, scoring_functions: List[str], metadata: Optional[Dict[str, Any]] = None, @@ -454,9 +454,9 @@ async def register_eval_task( "No provider specified and multiple providers available. Please specify a provider_id." ) if provider_eval_task_id is None: - provider_eval_task_id = eval_task_id + provider_eval_task_id = task_id eval_task = EvalTask( - identifier=eval_task_id, + identifier=task_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, From 79e7253625529d68a4465ef045cece2d95de3125 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 11:21:51 -0800 Subject: [PATCH 05/31] deprecation in OpenAPI spec --- docs/_static/llama-stack-spec.html | 178 ++++++++++-------- docs/_static/llama-stack-spec.yaml | 110 ++++++----- docs/openapi_generator/pyopenapi/generator.py | 6 +- .../pyopenapi/specification.py | 1 + llama_stack/apis/eval_tasks/eval_tasks.py | 5 + 5 files changed, 173 insertions(+), 127 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 6d3e5b93b4..459071d3f3 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,6 +40,28 @@ } ], "paths": { + "/v1/eval-tasks": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListEvalTasksResponse" + } + } + } + } + }, + "tags": [ + "EvalTasks" + ], + "description": "", + "parameters": [], + "deprecated": true + } + }, "/v1/datasetio/rows": { "get": { "responses": { @@ -2365,6 +2387,84 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { + "EvalTask": { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "provider_resource_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "eval_task", + "default": "eval_task" + }, + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "identifier", + "provider_resource_id", + "provider_id", + "type", + "dataset_id", + "scoring_functions", + "metadata" + ] + }, + "ListEvalTasksResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + 
"items": { + "$ref": "#/components/schemas/EvalTask" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ] + }, "AppendRowsRequest": { "type": "object", "properties": { @@ -5208,69 +5308,6 @@ "type" ] }, - "EvalTask": { - "type": "object", - "properties": { - "identifier": { - "type": "string" - }, - "provider_resource_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "type": { - "type": "string", - "const": "eval_task", - "default": "eval_task" - }, - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "identifier", - "provider_resource_id", - "provider_id", - "type", - "dataset_id", - "scoring_functions", - "metadata" - ] - }, "Model": { "type": "object", "properties": { @@ -6164,21 +6201,6 @@ "data" ] }, - "ListEvalTasksResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/EvalTask" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ] - }, "ListModelsResponse": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index aa25c88f8b..b37f09ef88 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,6 +10,20 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: + /v1/eval-tasks: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListEvalTasksResponse' + tags: + - EvalTasks + description: '' + parameters: [] + deprecated: true /v1/datasetio/rows: get: responses: @@ -1429,6 +1443,54 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: + EvalTask: + type: object + properties: + identifier: + type: string + provider_resource_id: + type: string + provider_id: + type: string + type: + type: string + const: eval_task + default: eval_task + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - identifier + - provider_resource_id + - provider_id + - type + - dataset_id + - scoring_functions + - metadata + ListEvalTasksResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/EvalTask' + additionalProperties: false + required: + - data AppendRowsRequest: type: object properties: @@ -3354,44 +3416,6 @@ components: additionalProperties: false required: - type - EvalTask: - type: object - properties: - identifier: - type: string - provider_resource_id: - type: string - provider_id: - type: string - type: - type: string - const: eval_task - default: eval_task - dataset_id: - type: string - scoring_functions: - type: array - items: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - identifier - - provider_resource_id - - 
provider_id - - type - - dataset_id - - scoring_functions - - metadata Model: type: object properties: @@ -3930,16 +3954,6 @@ components: additionalProperties: false required: - data - ListEvalTasksResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/EvalTask' - additionalProperties: false - required: - - data ListModelsResponse: type: object properties: diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index f0d30a0e65..86db8a06d9 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -644,7 +644,10 @@ def _build_operation(self, op: EndpointOperation) -> Operation: else: callbacks = None - description = "\n".join(filter(None, [doc_string.short_description, doc_string.long_description])) + description = "\n".join( + filter(None, [doc_string.short_description, doc_string.long_description]) + ) + return Operation( tags=[op.defining_class.__name__], summary=None, @@ -654,6 +657,7 @@ def _build_operation(self, op: EndpointOperation) -> Operation: requestBody=requestBody, responses=responses, callbacks=callbacks, + deprecated=True if "DEPRECATED" in op.func_name else None, security=[] if op.public else None, ) diff --git a/docs/openapi_generator/pyopenapi/specification.py b/docs/openapi_generator/pyopenapi/specification.py index 4b54295c56..f96de58b69 100644 --- a/docs/openapi_generator/pyopenapi/specification.py +++ b/docs/openapi_generator/pyopenapi/specification.py @@ -117,6 +117,7 @@ class Operation: requestBody: Optional[RequestBody] = None callbacks: Optional[Dict[str, "Callback"]] = None security: Optional[List["SecurityRequirement"]] = None + deprecated: Optional[bool] = None @dataclass diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index 0a1a27885d..9a26fd0c0d 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -45,6 +45,11 @@ class ListEvalTasksResponse(BaseModel): @runtime_checkable class EvalTasks(Protocol): + @webmethod(route="/eval-tasks", method="GET") + async def DEPRECATED_list_eval_tasks_deprecated( + self, + ) -> ListEvalTasksResponse: ... + @webmethod(route="/eval/tasks", method="GET") async def list_eval_tasks(self) -> ListEvalTasksResponse: ... 
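
The net effect of PATCH 05 is that the OpenAPI `deprecated` flag is driven purely by a naming convention on the protocol methods. A minimal sketch of that rule, with illustrative method names:

```python
# Mirrors the generator change above:
#   deprecated=True if "DEPRECATED" in op.func_name else None
# Any endpoint whose Python method name contains "DEPRECATED" is emitted
# with `deprecated: true` in the generated spec.

def operation_deprecated(func_name: str) -> bool:
    return "DEPRECATED" in func_name


assert operation_deprecated("DEPRECATED_list_eval_tasks")
assert not operation_deprecated("list_eval_tasks")
```
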
From 65ffcddd84269330e2921784616442056c7c2453 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 11:35:21 -0800 Subject: [PATCH 06/31] deprecation --- docs/_static/llama-stack-spec.html | 115 ++++++++++++++++++ docs/_static/llama-stack-spec.yaml | 66 ++++++++++ llama_stack/apis/eval_tasks/eval_tasks.py | 25 +++- .../distribution/routers/routing_tables.py | 20 +++ 4 files changed, 221 insertions(+), 5 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 459071d3f3..188ba96a4a 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,6 +40,44 @@ } ], "paths": { + "/v1/eval-tasks/{eval_task_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/EvalTask" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "EvalTasks" + ], + "description": "", + "parameters": [ + { + "name": "eval_task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, "/v1/eval-tasks": { "get": { "responses": { @@ -60,6 +98,29 @@ "description": "", "parameters": [], "deprecated": true + }, + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "EvalTasks" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest" + } + } + }, + "required": true + }, + "deprecated": true } }, "/v1/datasetio/rows": { @@ -2465,6 +2526,60 @@ "data" ] }, + "DeprecatedRegisterEvalTaskRequest": { + "type": "object", + "properties": { + "eval_task_id": { + "type": "string" + }, + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "provider_eval_task_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "eval_task_id", + "dataset_id", + "scoring_functions" + ] + }, "AppendRowsRequest": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index b37f09ef88..ed5b71d0d8 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,6 +10,27 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: + /v1/eval-tasks/{eval_task_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/EvalTask' + - type: 'null' + tags: + - EvalTasks + description: '' + parameters: + - name: eval_task_id + in: path + required: true + schema: + type: string + deprecated: true /v1/eval-tasks: get: responses: @@ -24,6 +45,21 @@ paths: description: '' parameters: [] deprecated: true + post: + responses: + '200': + description: OK + tags: + - EvalTasks + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' + required: true + deprecated: true /v1/datasetio/rows: get: responses: @@ -1491,6 +1527,36 @@ components: additionalProperties: false required: - data 
+ DeprecatedRegisterEvalTaskRequest: + type: object + properties: + eval_task_id: + type: string + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + provider_eval_task_id: + type: string + provider_id: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - eval_task_id + - dataset_id + - scoring_functions AppendRowsRequest: type: object properties: diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index 9a26fd0c0d..6d12fd2f7f 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -45,11 +45,6 @@ class ListEvalTasksResponse(BaseModel): @runtime_checkable class EvalTasks(Protocol): - @webmethod(route="/eval-tasks", method="GET") - async def DEPRECATED_list_eval_tasks_deprecated( - self, - ) -> ListEvalTasksResponse: ... - @webmethod(route="/eval/tasks", method="GET") async def list_eval_tasks(self) -> ListEvalTasksResponse: ... @@ -69,3 +64,23 @@ async def register_eval_task( provider_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, ) -> None: ... + + @webmethod(route="/eval-tasks", method="GET") + async def DEPRECATED_list_eval_tasks(self) -> ListEvalTasksResponse: ... + + @webmethod(route="/eval-tasks/{eval_task_id}", method="GET") + async def DEPRECATED_get_eval_task( + self, + eval_task_id: str, + ) -> Optional[EvalTask]: ... + + @webmethod(route="/eval-tasks", method="POST") + async def DEPRECATED_register_eval_task( + self, + eval_task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_eval_task_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... 
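
With both method families on the protocol, a caller holding any `EvalTasks` implementation can reach either surface. A hedged usage sketch — `impl` is a stand-in for an implementing object, not a name from the patch:

```python
# New-style surface: /v1/eval/tasks and /v1/eval/tasks/{task_id};
# legacy surface: /v1/eval-tasks, kept for compatibility but deprecated.
async def demo(impl) -> None:
    tasks = await impl.list_eval_tasks()
    if tasks.data:
        task = await impl.get_eval_task(task_id=tasks.data[0].identifier)

    # The routing-table diff below makes the legacy methods raise
    # DeprecationWarning rather than serve results.
    await impl.DEPRECATED_list_eval_tasks()
```
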
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 0664e310a5..98e3afd3ff 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -465,6 +465,26 @@ async def register_eval_task( ) await self.register_object(eval_task) + async def DEPRECATED_list_eval_tasks(self) -> ListEvalTasksResponse: + raise DeprecationWarning("Use /eval/tasks instead") + + async def DEPRECATED_get_eval_task( + self, + eval_task_id: str, + ) -> Optional[EvalTask]: + raise DeprecationWarning("Use /eval/tasks instead") + + async def DEPRECATED_register_eval_task( + self, + eval_task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_eval_task_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: + raise DeprecationWarning("Use /eval/tasks instead") + class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): async def list_tools(self, toolgroup_id: Optional[str] = None) -> ListToolsResponse: From 9a8f4025c1296b02db74fdd48a9d2ac6afe77348 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:29:36 -0800 Subject: [PATCH 07/31] naming update --- docs/_static/llama-stack-spec.html | 72 +++++++++---------- docs/_static/llama-stack-spec.yaml | 72 +++++++++---------- docs/getting_started.ipynb | 4 +- .../Llama_Stack_Benchmark_Evals.ipynb | 12 ++-- docs/source/building_applications/evals.md | 8 +-- .../building_applications/evaluation.md | 4 +- docs/source/concepts/evaluation_concepts.md | 4 +- docs/source/concepts/index.md | 2 +- docs/source/playground/index.md | 4 +- .../references/evals_reference/index.md | 24 +++---- .../llama_stack_client_cli_reference.md | 10 +-- .../references/python_sdk_reference/index.md | 14 ++-- llama_stack/apis/datatypes.py | 2 +- llama_stack/apis/eval/eval.py | 14 ++-- llama_stack/apis/eval_tasks/__init__.py | 2 +- llama_stack/apis/eval_tasks/eval_tasks.py | 44 ++++++------ llama_stack/apis/resource.py | 2 +- llama_stack/distribution/datatypes.py | 9 +-- llama_stack/distribution/distribution.py | 2 +- llama_stack/distribution/resolver.py | 8 +-- llama_stack/distribution/routers/__init__.py | 4 +- llama_stack/distribution/routers/routers.py | 8 +-- .../distribution/routers/routing_tables.py | 49 ++++++------- llama_stack/distribution/stack.py | 6 +- llama_stack/distribution/ui/README.md | 2 +- .../ui/page/distribution/eval_tasks.py | 10 +-- .../ui/page/distribution/resources.py | 4 +- .../ui/page/evaluations/native_eval.py | 44 ++++++------ llama_stack/providers/datatypes.py | 7 +- .../inline/eval/meta_reference/eval.py | 36 +++++----- llama_stack/providers/tests/eval/test_eval.py | 42 +++++------ llama_stack/providers/tests/resolver.py | 7 +- llama_stack/templates/bedrock/run.yaml | 2 +- llama_stack/templates/cerebras/run.yaml | 2 +- .../templates/dell/run-with-safety.yaml | 2 +- llama_stack/templates/dell/run.yaml | 2 +- .../experimental-post-training/run.yaml | 2 +- .../templates/fireworks/run-with-safety.yaml | 2 +- llama_stack/templates/fireworks/run.yaml | 2 +- .../hf-endpoint/run-with-safety.yaml | 2 +- llama_stack/templates/hf-endpoint/run.yaml | 2 +- .../hf-serverless/run-with-safety.yaml | 2 +- llama_stack/templates/hf-serverless/run.yaml | 2 +- .../meta-reference-gpu/run-with-safety.yaml | 2 +- .../templates/meta-reference-gpu/run.yaml | 2 +- .../meta-reference-quantized-gpu/run.yaml | 2 +- llama_stack/templates/nvidia/run.yaml | 2 +- 
.../templates/ollama/run-with-safety.yaml | 2 +- llama_stack/templates/ollama/run.yaml | 2 +- .../remote-vllm/run-with-safety.yaml | 2 +- llama_stack/templates/remote-vllm/run.yaml | 2 +- llama_stack/templates/sambanova/run.yaml | 2 +- .../templates/tgi/run-with-safety.yaml | 2 +- llama_stack/templates/tgi/run.yaml | 2 +- .../templates/together/run-with-safety.yaml | 2 +- llama_stack/templates/together/run.yaml | 2 +- llama_stack/templates/vllm-gpu/run.yaml | 2 +- 57 files changed, 293 insertions(+), 289 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 188ba96a4a..84c6fd99df 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,7 +40,7 @@ } ], "paths": { - "/v1/eval-tasks/{eval_task_id}": { + "/v1/eval-tasks/{benchmark_id}": { "get": { "responses": { "200": { @@ -50,7 +50,7 @@ "schema": { "oneOf": [ { - "$ref": "#/components/schemas/EvalTask" + "$ref": "#/components/schemas/Benchmark" }, { "type": "null" @@ -62,12 +62,12 @@ } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [ { - "name": "eval_task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -86,14 +86,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListEvalTasksResponse" + "$ref": "#/components/schemas/ListBenchmarksResponse" } } } } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [], @@ -106,7 +106,7 @@ } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [], @@ -114,7 +114,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest" + "$ref": "#/components/schemas/DeprecatedRegisterBenchmarkRequest" } } }, @@ -821,7 +821,7 @@ "schema": { "oneOf": [ { - "$ref": "#/components/schemas/EvalTask" + "$ref": "#/components/schemas/Benchmark" }, { "type": "null" @@ -833,7 +833,7 @@ } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [ @@ -1594,14 +1594,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListEvalTasksResponse" + "$ref": "#/components/schemas/ListBenchmarksResponse" } } } } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [] @@ -1613,7 +1613,7 @@ } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [], @@ -1621,7 +1621,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RegisterEvalTaskRequest" + "$ref": "#/components/schemas/RegisterBenchmarkRequest" } } }, @@ -2448,7 +2448,7 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { - "EvalTask": { + "Benchmark": { "type": "object", "properties": { "identifier": { @@ -2462,8 +2462,8 @@ }, "type": { "type": "string", - "const": "eval_task", - "default": "eval_task" + "const": "benchmark", + "default": "benchmark" }, "dataset_id": { "type": "string" @@ -2511,13 +2511,13 @@ "metadata" ] }, - "ListEvalTasksResponse": { + "ListBenchmarksResponse": { "type": "object", "properties": { "data": { "type": "array", "items": { - "$ref": "#/components/schemas/EvalTask" + "$ref": "#/components/schemas/Benchmark" } } }, @@ -2526,10 +2526,10 @@ "data" ] }, - "DeprecatedRegisterEvalTaskRequest": { + "DeprecatedRegisterBenchmarkRequest": { "type": "object", "properties": { - "eval_task_id": { + "benchmark_id": { "type": "string" }, "dataset_id": { @@ -2541,7 +2541,7 @@ "type": "string" } }, - 
"provider_eval_task_id": { + "provider_benchmark_id": { "type": "string" }, "provider_id": { @@ -2575,7 +2575,7 @@ }, "additionalProperties": false, "required": [ - "eval_task_id", + "benchmark_id", "dataset_id", "scoring_functions" ] @@ -4745,7 +4745,7 @@ "accuracy" ] }, - "AppEvalTaskConfig": { + "AppBenchmarkConfig": { "type": "object", "properties": { "type": { @@ -4793,7 +4793,7 @@ "type" ] }, - "BenchmarkEvalTaskConfig": { + "BenchmarkBenchmarkConfig": { "type": "object", "properties": { "type": { @@ -4831,20 +4831,20 @@ } } }, - "EvalTaskConfig": { + "BenchmarkConfig": { "oneOf": [ { - "$ref": "#/components/schemas/BenchmarkEvalTaskConfig" + "$ref": "#/components/schemas/BenchmarkBenchmarkConfig" }, { - "$ref": "#/components/schemas/AppEvalTaskConfig" + "$ref": "#/components/schemas/AppBenchmarkConfig" } ], "discriminator": { "propertyName": "type", "mapping": { - "benchmark": "#/components/schemas/BenchmarkEvalTaskConfig", - "app": "#/components/schemas/AppEvalTaskConfig" + "benchmark": "#/components/schemas/BenchmarkBenchmarkConfig", + "app": "#/components/schemas/AppBenchmarkConfig" } } }, @@ -4991,7 +4991,7 @@ } }, "task_config": { - "$ref": "#/components/schemas/EvalTaskConfig" + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, @@ -7358,7 +7358,7 @@ "url" ] }, - "RegisterEvalTaskRequest": { + "RegisterBenchmarkRequest": { "type": "object", "properties": { "task_id": { @@ -7373,7 +7373,7 @@ "type": "string" } }, - "provider_eval_task_id": { + "provider_benchmark_id": { "type": "string" }, "provider_id": { @@ -7603,7 +7603,7 @@ "type": "object", "properties": { "task_config": { - "$ref": "#/components/schemas/EvalTaskConfig" + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, @@ -8115,7 +8115,7 @@ "name": "Eval" }, { - "name": "EvalTasks" + "name": "Benchmarks" }, { "name": "Inference", @@ -8171,7 +8171,7 @@ "DatasetIO", "Datasets", "Eval", - "EvalTasks", + "Benchmarks", "Inference", "Inspect", "Models", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index ed5b71d0d8..dd0951fdec 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,7 +10,7 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: - /v1/eval-tasks/{eval_task_id}: + /v1/eval-tasks/{benchmark_id}: get: responses: '200': @@ -19,13 +19,13 @@ paths: application/json: schema: oneOf: - - $ref: '#/components/schemas/EvalTask' + - $ref: '#/components/schemas/Benchmark' - type: 'null' tags: - - EvalTasks + - Benchmarks description: '' parameters: - - name: eval_task_id + - name: benchmark_id in: path required: true schema: @@ -39,9 +39,9 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ListEvalTasksResponse' + $ref: '#/components/schemas/ListBenchmarksResponse' tags: - - EvalTasks + - Benchmarks description: '' parameters: [] deprecated: true @@ -50,14 +50,14 @@ paths: '200': description: OK tags: - - EvalTasks + - Benchmarks description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' + $ref: '#/components/schemas/DeprecatedRegisterBenchmarkRequest' required: true deprecated: true /v1/datasetio/rows: @@ -499,10 +499,10 @@ paths: application/json: schema: oneOf: - - $ref: '#/components/schemas/EvalTask' + - $ref: '#/components/schemas/Benchmark' - type: 'null' tags: - - EvalTasks + - Benchmarks description: '' parameters: - name: task_id @@ -953,9 +953,9 @@ paths: 
content: application/json: schema: - $ref: '#/components/schemas/ListEvalTasksResponse' + $ref: '#/components/schemas/ListBenchmarksResponse' tags: - - EvalTasks + - Benchmarks description: '' parameters: [] post: @@ -963,14 +963,14 @@ paths: '200': description: OK tags: - - EvalTasks + - Benchmarks description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/RegisterEvalTaskRequest' + $ref: '#/components/schemas/RegisterBenchmarkRequest' required: true /v1/models: get: @@ -1479,7 +1479,7 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: - EvalTask: + Benchmark: type: object properties: identifier: @@ -1490,8 +1490,8 @@ components: type: string type: type: string - const: eval_task - default: eval_task + const: benchmark + default: benchmark dataset_id: type: string scoring_functions: @@ -1517,20 +1517,20 @@ components: - dataset_id - scoring_functions - metadata - ListEvalTasksResponse: + ListBenchmarksResponse: type: object properties: data: type: array items: - $ref: '#/components/schemas/EvalTask' + $ref: '#/components/schemas/Benchmark' additionalProperties: false required: - data - DeprecatedRegisterEvalTaskRequest: + DeprecatedRegisterBenchmarkRequest: type: object properties: - eval_task_id: + benchmark_id: type: string dataset_id: type: string @@ -1538,7 +1538,7 @@ components: type: array items: type: string - provider_eval_task_id: + provider_benchmark_id: type: string provider_id: type: string @@ -1554,7 +1554,7 @@ components: - type: object additionalProperties: false required: - - eval_task_id + - benchmark_id - dataset_id - scoring_functions AppendRowsRequest: @@ -3063,7 +3063,7 @@ components: - median - categorical_count - accuracy - AppEvalTaskConfig: + AppBenchmarkConfig: type: object properties: type: @@ -3097,7 +3097,7 @@ components: additionalProperties: false required: - type - BenchmarkEvalTaskConfig: + BenchmarkBenchmarkConfig: type: object properties: type: @@ -3121,15 +3121,15 @@ components: mapping: model: '#/components/schemas/ModelCandidate' agent: '#/components/schemas/AgentCandidate' - EvalTaskConfig: + BenchmarkConfig: oneOf: - - $ref: '#/components/schemas/BenchmarkEvalTaskConfig' - - $ref: '#/components/schemas/AppEvalTaskConfig' + - $ref: '#/components/schemas/BenchmarkBenchmarkConfig' + - $ref: '#/components/schemas/AppBenchmarkConfig' discriminator: propertyName: type mapping: - benchmark: '#/components/schemas/BenchmarkEvalTaskConfig' - app: '#/components/schemas/AppEvalTaskConfig' + benchmark: '#/components/schemas/BenchmarkBenchmarkConfig' + app: '#/components/schemas/AppBenchmarkConfig' LLMAsJudgeScoringFnParams: type: object properties: @@ -3220,7 +3220,7 @@ components: items: type: string task_config: - $ref: '#/components/schemas/EvalTaskConfig' + $ref: '#/components/schemas/BenchmarkConfig' additionalProperties: false required: - input_rows @@ -4675,7 +4675,7 @@ components: - dataset_id - dataset_schema - url - RegisterEvalTaskRequest: + RegisterBenchmarkRequest: type: object properties: task_id: @@ -4686,7 +4686,7 @@ components: type: array items: type: string - provider_eval_task_id: + provider_benchmark_id: type: string provider_id: type: string @@ -4815,7 +4815,7 @@ components: type: object properties: task_config: - $ref: '#/components/schemas/EvalTaskConfig' + $ref: '#/components/schemas/BenchmarkConfig' additionalProperties: false required: - task_config @@ -5128,7 +5128,7 @@ tags: - name: DatasetIO - name: Datasets - name: Eval - - name: EvalTasks + - 
name: Benchmarks - name: Inference description: >- This API provides the raw interface to the underlying models. Two kinds of models @@ -5162,7 +5162,7 @@ x-tagGroups: - DatasetIO - Datasets - Eval - - EvalTasks + - Benchmarks - Inference - Inspect - Models diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index abe537c8e1..ee616b4716 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -324,7 +324,7 @@ "- vector_io\n", "container_image: null\n", "datasets: []\n", - "eval_tasks: []\n", + "benchmarks: []\n", "image_name: together\n", "metadata_store:\n", " db_path: /Users/ashwin/.llama/distributions/together/registry.db\n", @@ -508,7 +508,7 @@ "- vector_io\n", "container_image: null\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "image_name: together\n", "metadata_store:\n", " db_path: \u001b[35m/Users/ashwin/.llama/distributions/together/\u001b[0m\u001b[95mregistry.db\u001b[0m\n", diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 84da252469..6e8480f945 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -370,7 +370,7 @@ "- tool_runtime\n", "datasets: []\n", "container_image: null\n", - "eval_tasks: []\n", + "benchmarks: []\n", "image_name: together\n", "memory_banks: []\n", "metadata_store:\n", @@ -551,7 +551,7 @@ "- tool_runtime\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "container_image: null\n", - "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "image_name: together\n", "memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "metadata_store:\n", @@ -1017,8 +1017,8 @@ " \"content\": SYSTEM_PROMPT_TEMPLATE.format(subject=subset),\n", "}\n", "\n", - "client.eval_tasks.register(\n", - " eval_task_id=\"meta-reference::mmmu\",\n", + "client.benchmarks.register(\n", + " benchmark_id=\"meta-reference::mmmu\",\n", " dataset_id=f\"mmmu-{subset}-{split}\",\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", ")\n", @@ -1196,8 +1196,8 @@ " provider_id=\"together\",\n", ")\n", "\n", - "client.eval_tasks.register(\n", - " eval_task_id=\"meta-reference::simpleqa\",\n", + "client.benchmarks.register(\n", + " benchmark_id=\"meta-reference::simpleqa\",\n", " dataset_id=simpleqa_dataset_id,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", ")\n", diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md index c4cb476e4f..c1c371ca80 100644 --- a/docs/source/building_applications/evals.md +++ b/docs/source/building_applications/evals.md @@ -41,8 +41,8 @@ system_message = { "content": SYSTEM_PROMPT_TEMPLATE, } -client.eval_tasks.register( - eval_task_id="meta-reference::mmmu", +client.benchmarks.register( + benchmark_id="meta-reference::mmmu", dataset_id=f"mmmu-{subset}-{split}", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) @@ -99,8 +99,8 @@ eval_rows = client.datasetio.get_rows_paginated( ``` ```python -client.eval_tasks.register( - eval_task_id="meta-reference::simpleqa", +client.benchmarks.register( + benchmark_id="meta-reference::simpleqa", dataset_id=simpleqa_dataset_id, scoring_functions=["llm-as-judge::405b-simpleqa"], ) diff --git a/docs/source/building_applications/evaluation.md 
b/docs/source/building_applications/evaluation.md index 91e5c552bd..df18c146cc 100644 --- a/docs/source/building_applications/evaluation.md +++ b/docs/source/building_applications/evaluation.md @@ -10,8 +10,8 @@ Here's how to set up basic evaluation: ```python # Create an evaluation task -response = client.eval_tasks.register( - eval_task_id="my_eval", +response = client.benchmarks.register( + benchmark_id="my_eval", dataset_id="my_dataset", scoring_functions=["accuracy", "relevance"], ) diff --git a/docs/source/concepts/evaluation_concepts.md b/docs/source/concepts/evaluation_concepts.md index 399d99d92d..3ca4b0ac8e 100644 --- a/docs/source/concepts/evaluation_concepts.md +++ b/docs/source/concepts/evaluation_concepts.md @@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications. - `/datasetio` + `/datasets` API - `/scoring` + `/scoring_functions` API -- `/eval` + `/eval_tasks` API +- `/eval` + `/benchmarks` API This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). @@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo - **Scoring**: evaluate outputs of the system. - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics. - **Eval**: generate outputs (via Inference or Agents) and perform scoring. - - Associated with `EvalTask` resource. + - Associated with `Benchmark` resource. Use the following decision tree to decide how to use LlamaStack Evaluation flow. diff --git a/docs/source/concepts/index.md b/docs/source/concepts/index.md index 1437ec6232..403e47c489 100644 --- a/docs/source/concepts/index.md +++ b/docs/source/concepts/index.md @@ -42,7 +42,7 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi - **Tool Runtime** is associated with `ToolGroup` resources. - **DatasetIO** is associated with `Dataset` resources. - **Scoring** is associated with `ScoringFunction` resources. -- **Eval** is associated with `Model` and `EvalTask` resources. +- **Eval** is associated with `Model` and `Benchmark` resources. Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack. 
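
The docs hunks above all apply the same mechanical rename; in client code the before/after looks roughly like this (a hedged sketch — the base URL and dataset id are illustrative assumptions, not values from the patch):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")  # assumed local server

# Before this patch the docs used:
#   client.eval_tasks.register(eval_task_id="meta-reference::mmmu", ...)

# After this patch they use the renamed surface:
client.benchmarks.register(
    benchmark_id="meta-reference::mmmu",
    dataset_id="mmmu-val-dev",  # illustrative; the notebook derives this from subset/split
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
```
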
diff --git a/docs/source/playground/index.md b/docs/source/playground/index.md index d74bf1a03b..9691609abf 100644 --- a/docs/source/playground/index.md +++ b/docs/source/playground/index.md @@ -64,7 +64,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie ``` ```bash - $ llama-stack-client eval_tasks register \ + $ llama-stack-client benchmarks register \ --eval-task-id meta-reference-mmlu \ --provider-id meta-reference \ --dataset-id mmlu \ @@ -86,7 +86,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie - Under the hood, it uses Llama Stack's `/providers` API to get information about the providers. - **API Resources**: Inspect Llama Stack API resources - - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `eval_tasks`, `shields`). + - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `benchmarks`, `shields`). - Under the hood, it uses Llama Stack's `//list` API to get information about each resources. - Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources. diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md index 86f66208af..f0275511df 100644 --- a/docs/source/references/evals_reference/index.md +++ b/docs/source/references/evals_reference/index.md @@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications. - `/datasetio` + `/datasets` API - `/scoring` + `/scoring_functions` API -- `/eval` + `/eval_tasks` API +- `/eval` + `/benchmarks` API This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). @@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo - **Scoring**: evaluate outputs of the system. - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics. - **Eval**: generate outputs (via Inference or Agents) and perform scoring. - - Associated with `EvalTask` resource. + - Associated with `Benchmark` resource. Use the following decision tree to decide how to use LlamaStack Evaluation flow. 
@@ -77,8 +77,8 @@ system_message = { "content": SYSTEM_PROMPT_TEMPLATE, } -client.eval_tasks.register( - eval_task_id="meta-reference::mmmu", +client.benchmarks.register( + benchmark_id="meta-reference::mmmu", dataset_id=f"mmmu-{subset}-{split}", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) @@ -135,8 +135,8 @@ eval_rows = client.datasetio.get_rows_paginated( ``` ```python -client.eval_tasks.register( - eval_task_id="meta-reference::simpleqa", +client.benchmarks.register( + benchmark_id="meta-reference::simpleqa", dataset_id=simpleqa_dataset_id, scoring_functions=["llm-as-judge::405b-simpleqa"], ) @@ -281,7 +281,7 @@ The following examples give the quick steps to start running evaluations using t #### Benchmark Evaluation CLI Usage: There are 2 inputs necessary for running a benchmark eval -- `eval-task-id`: the identifier associated with the eval task. Each `EvalTask` is parametrized by +- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by - `dataset_id`: the identifier associated with the dataset. - `List[scoring_function_id]`: list of scoring function identifiers. - `eval-task-config`: specifies the configuration of the model / agent to evaluate on. @@ -289,7 +289,7 @@ Usage: There are 2 inputs necessary for running a benchmark eval ``` llama-stack-client eval run_benchmark \ ---eval-task-config ~/eval_task_config.json \ +--eval-task-config ~/benchmark_config.json \ --visualize ``` @@ -309,15 +309,15 @@ llama-stack-client eval run_scoring ... --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] +$ llama-stack-client benchmarks register --eval-task-id --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] ``` Options: @@ -191,7 +191,7 @@ Options: - `--num-examples`: Optional. Number of examples to evaluate (useful for debugging) - `--visualize`: Optional flag. 
If set, visualizes evaluation results after completion -Example eval_task_config.json: +Example benchmark_config.json: ```json { "type": "benchmark", diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md index 8a06e22442..eca8c58f54 100644 --- a/docs/source/references/python_sdk_reference/index.md +++ b/docs/source/references/python_sdk_reference/index.md @@ -443,20 +443,20 @@ Methods: - client.scoring_functions.list() -> ScoringFunctionListResponse - client.scoring_functions.register(\*\*params) -> None -## EvalTasks +## Benchmarks Types: ```python from llama_stack_client.types import ( - EvalTask, - ListEvalTasksResponse, - EvalTaskListResponse, + Benchmark, + ListBenchmarksResponse, + BenchmarkListResponse, ) ``` Methods: -- client.eval_tasks.retrieve(eval_task_id) -> Optional[EvalTask] -- client.eval_tasks.list() -> EvalTaskListResponse -- client.eval_tasks.register(\*\*params) -> None +- client.benchmarks.retrieve(benchmark_id) -> Optional[Benchmark] +- client.benchmarks.list() -> BenchmarkListResponse +- client.benchmarks.register(\*\*params) -> None diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py index ccc395b80b..0751b2c9b2 100644 --- a/llama_stack/apis/datatypes.py +++ b/llama_stack/apis/datatypes.py @@ -28,7 +28,7 @@ class Api(Enum): vector_dbs = "vector_dbs" datasets = "datasets" scoring_functions = "scoring_functions" - eval_tasks = "eval_tasks" + benchmarks = "benchmarks" tool_groups = "tool_groups" # built-in API diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index ae13a5bd95..16b96d618b 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -38,7 +38,7 @@ class AgentCandidate(BaseModel): @json_schema_type -class BenchmarkEvalTaskConfig(BaseModel): +class BenchmarkBenchmarkConfig(BaseModel): type: Literal["benchmark"] = "benchmark" eval_candidate: EvalCandidate num_examples: Optional[int] = Field( @@ -48,7 +48,7 @@ class BenchmarkEvalTaskConfig(BaseModel): @json_schema_type -class AppEvalTaskConfig(BaseModel): +class AppBenchmarkConfig(BaseModel): type: Literal["app"] = "app" eval_candidate: EvalCandidate scoring_params: Dict[str, ScoringFnParams] = Field( @@ -62,9 +62,9 @@ class AppEvalTaskConfig(BaseModel): # we could optinally add any specific dataset config here -EvalTaskConfig = register_schema( - Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")], - name="EvalTaskConfig", +BenchmarkConfig = register_schema( + Annotated[Union[BenchmarkBenchmarkConfig, AppBenchmarkConfig], Field(discriminator="type")], + name="BenchmarkConfig", ) @@ -80,7 +80,7 @@ class Eval(Protocol): async def run_eval( self, task_id: str, - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> Job: ... @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") @@ -89,7 +89,7 @@ async def evaluate_rows( task_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/eval_tasks/__init__.py index 7ca2167068..f8f5649570 100644 --- a/llama_stack/apis/eval_tasks/__init__.py +++ b/llama_stack/apis/eval_tasks/__init__.py @@ -4,4 +4,4 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from .eval_tasks import * # noqa: F401 F403 +from .benchmarks import * # noqa: F401 F403 diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index 6d12fd2f7f..7c8ed8dc04 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -11,7 +11,7 @@ from llama_stack.apis.resource import Resource, ResourceType -class CommonEvalTaskFields(BaseModel): +class CommonBenchmarkFields(BaseModel): dataset_id: str scoring_functions: List[str] metadata: Dict[str, Any] = Field( @@ -21,66 +21,66 @@ class CommonEvalTaskFields(BaseModel): @json_schema_type -class EvalTask(CommonEvalTaskFields, Resource): - type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value +class Benchmark(CommonBenchmarkFields, Resource): + type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value @property def task_id(self) -> str: return self.identifier @property - def provider_eval_task_id(self) -> str: + def provider_benchmark_id(self) -> str: return self.provider_resource_id -class EvalTaskInput(CommonEvalTaskFields, BaseModel): +class BenchmarkInput(CommonBenchmarkFields, BaseModel): task_id: str provider_id: Optional[str] = None - provider_eval_task_id: Optional[str] = None + provider_benchmark_id: Optional[str] = None -class ListEvalTasksResponse(BaseModel): - data: List[EvalTask] +class ListBenchmarksResponse(BaseModel): + data: List[Benchmark] @runtime_checkable -class EvalTasks(Protocol): +class Benchmarks(Protocol): @webmethod(route="/eval/tasks", method="GET") - async def list_eval_tasks(self) -> ListEvalTasksResponse: ... + async def list_benchmarks(self) -> ListBenchmarksResponse: ... @webmethod(route="/eval/tasks/{task_id}", method="GET") - async def get_eval_task( + async def get_benchmark( self, task_id: str, - ) -> Optional[EvalTask]: ... + ) -> Optional[Benchmark]: ... @webmethod(route="/eval/tasks", method="POST") - async def register_eval_task( + async def register_benchmark( self, task_id: str, dataset_id: str, scoring_functions: List[str], - provider_eval_task_id: Optional[str] = None, + provider_benchmark_id: Optional[str] = None, provider_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, ) -> None: ... @webmethod(route="/eval-tasks", method="GET") - async def DEPRECATED_list_eval_tasks(self) -> ListEvalTasksResponse: ... + async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: ... - @webmethod(route="/eval-tasks/{eval_task_id}", method="GET") - async def DEPRECATED_get_eval_task( + @webmethod(route="/eval-tasks/{benchmark_id}", method="GET") + async def DEPRECATED_get_benchmark( self, - eval_task_id: str, - ) -> Optional[EvalTask]: ... + benchmark_id: str, + ) -> Optional[Benchmark]: ... @webmethod(route="/eval-tasks", method="POST") - async def DEPRECATED_register_eval_task( + async def DEPRECATED_register_benchmark( self, - eval_task_id: str, + benchmark_id: str, dataset_id: str, scoring_functions: List[str], - provider_eval_task_id: Optional[str] = None, + provider_benchmark_id: Optional[str] = None, provider_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, ) -> None: ... 
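
The renamed resource keeps the old identifiers reachable as read-only properties. A hedged sketch of that mapping — the import path mirrors the new `from llama_stack.apis.benchmarks import ...` in `datatypes.py` and assumes the module rename is complete; field values are illustrative:

```python
from llama_stack.apis.benchmarks import Benchmark

bench = Benchmark(
    identifier="meta-reference::mmmu",
    provider_resource_id="meta-reference::mmmu",
    provider_id="meta-reference",
    dataset_id="mmmu-val",  # illustrative dataset id
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)

# `task_id` and `provider_benchmark_id` are properties over Resource fields:
assert bench.task_id == bench.identifier
assert bench.provider_benchmark_id == bench.provider_resource_id
```
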
diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py index 145113a5d6..70ec63c55d 100644 --- a/llama_stack/apis/resource.py +++ b/llama_stack/apis/resource.py @@ -15,7 +15,7 @@ class ResourceType(Enum): vector_db = "vector_db" dataset = "dataset" scoring_function = "scoring_function" - eval_task = "eval_task" + benchmark = "benchmark" tool = "tool" tool_group = "tool_group" diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 97706f22a5..75ab73b9ba 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -8,10 +8,11 @@ from pydantic import BaseModel, Field +from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput + from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset, DatasetInput from llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput from llama_stack.apis.inference import Inference from llama_stack.apis.models import Model, ModelInput from llama_stack.apis.safety import Safety @@ -37,7 +38,7 @@ VectorDB, Dataset, ScoringFn, - EvalTask, + Benchmark, Tool, ToolGroup, ] @@ -50,7 +51,7 @@ VectorDB, Dataset, ScoringFn, - EvalTask, + Benchmark, Tool, ToolGroup, ], @@ -173,7 +174,7 @@ class StackRunConfig(BaseModel): vector_dbs: List[VectorDBInput] = Field(default_factory=list) datasets: List[DatasetInput] = Field(default_factory=list) scoring_fns: List[ScoringFnInput] = Field(default_factory=list) - eval_tasks: List[EvalTaskInput] = Field(default_factory=list) + benchmarks: List[BenchmarkInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) server: ServerConfig = Field( diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 2dcf38463b..384e2c3c89 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -44,7 +44,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: router_api=Api.scoring, ), AutoRoutedApiInfo( - routing_table_api=Api.eval_tasks, + routing_table_api=Api.benchmarks, router_api=Api.eval, ), AutoRoutedApiInfo( diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index 353c2971ba..0bc2e774c1 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -9,10 +9,10 @@ from typing import Any, Dict, List, Set from llama_stack.apis.agents import Agents +from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models @@ -37,8 +37,8 @@ from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.providers.datatypes import ( Api, + BenchmarksProtocolPrivate, DatasetsProtocolPrivate, - EvalTasksProtocolPrivate, InlineProviderSpec, ModelsProtocolPrivate, ProviderSpec, @@ -73,7 +73,7 @@ def api_protocol_map() -> Dict[Api, Any]: Api.scoring: Scoring, Api.scoring_functions: ScoringFunctions, Api.eval: Eval, - Api.eval_tasks: EvalTasks, + Api.benchmarks: Benchmarks, Api.post_training: PostTraining, Api.tool_groups: ToolGroups, Api.tool_runtime: ToolRuntime, @@ -92,7 +92,7 @@ def additional_protocols_map() -> Dict[Api, Any]: 
ScoringFunctions, Api.scoring_functions, ), - Api.eval: (EvalTasksProtocolPrivate, EvalTasks, Api.eval_tasks), + Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks), } diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py index 156cda3859..24defdcacb 100644 --- a/llama_stack/distribution/routers/__init__.py +++ b/llama_stack/distribution/routers/__init__.py @@ -12,8 +12,8 @@ from llama_stack.providers.datatypes import Api, RoutingTable from .routing_tables import ( + BenchmarksRoutingTable, DatasetsRoutingTable, - EvalTasksRoutingTable, ModelsRoutingTable, ScoringFunctionsRoutingTable, ShieldsRoutingTable, @@ -34,7 +34,7 @@ async def get_routing_table_impl( "shields": ShieldsRoutingTable, "datasets": DatasetsRoutingTable, "scoring_functions": ScoringFunctionsRoutingTable, - "eval_tasks": EvalTasksRoutingTable, + "benchmarks": BenchmarksRoutingTable, "tool_groups": ToolGroupsRoutingTable, } diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 6cddcf73cb..6697b03e26 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -9,9 +9,9 @@ from llama_stack.apis.common.content_types import InterleavedContent, URL from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult from llama_stack.apis.eval import ( - AppEvalTaskConfig, + AppBenchmarkConfig, + BenchmarkConfig, Eval, - EvalTaskConfig, EvaluateResponse, Job, JobStatus, @@ -348,7 +348,7 @@ async def shutdown(self) -> None: async def run_eval( self, task_id: str, - task_config: AppEvalTaskConfig, + task_config: AppBenchmarkConfig, ) -> Job: return await self.routing_table.get_provider_impl(task_id).run_eval( task_id=task_id, @@ -360,7 +360,7 @@ async def evaluate_rows( task_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: return await self.routing_table.get_provider_impl(task_id).evaluate_rows( task_id=task_id, diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 98e3afd3ff..6c1b06ed6b 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -8,10 +8,11 @@ from pydantic import TypeAdapter +from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse + from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ParamType from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse -from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType from llama_stack.apis.resource import ResourceType from llama_stack.apis.scoring_functions import ( @@ -60,7 +61,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable elif api == Api.scoring: return await p.register_scoring_function(obj) elif api == Api.eval: - return await p.register_eval_task(obj) + return await p.register_benchmark(obj) elif api == Api.tool_runtime: return await p.register_tool(obj) else: @@ -121,7 +122,7 @@ async def add_objects(objs: List[RoutableObjectWithProvider], provider_id: str, scoring_functions = await p.list_scoring_functions() await add_objects(scoring_functions, pid, ScoringFn) elif api == Api.eval: - p.eval_task_store = self + 
p.benchmark_store = self elif api == Api.tool_runtime: p.tool_store = self @@ -141,8 +142,8 @@ def apiname_object(): return ("DatasetIO", "dataset") elif isinstance(self, ScoringFunctionsRoutingTable): return ("Scoring", "scoring_function") - elif isinstance(self, EvalTasksRoutingTable): - return ("Eval", "eval_task") + elif isinstance(self, BenchmarksRoutingTable): + return ("Eval", "benchmark") elif isinstance(self, ToolGroupsRoutingTable): return ("Tools", "tool") else: @@ -428,20 +429,20 @@ async def register_scoring_function( await self.register_object(scoring_fn) -class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks): - async def list_eval_tasks(self) -> ListEvalTasksResponse: - return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task")) +class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): + async def list_benchmarks(self) -> ListBenchmarksResponse: + return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark")) - async def get_eval_task(self, task_id: str) -> Optional[EvalTask]: - return await self.get_object_by_identifier("eval_task", task_id) + async def get_benchmark(self, task_id: str) -> Optional[Benchmark]: + return await self.get_object_by_identifier("benchmark", task_id) - async def register_eval_task( + async def register_benchmark( self, task_id: str, dataset_id: str, scoring_functions: List[str], metadata: Optional[Dict[str, Any]] = None, - provider_eval_task_id: Optional[str] = None, + provider_benchmark_id: Optional[str] = None, provider_id: Optional[str] = None, ) -> None: if metadata is None: @@ -453,33 +454,33 @@ async def register_eval_task( raise ValueError( "No provider specified and multiple providers available. Please specify a provider_id." ) - if provider_eval_task_id is None: - provider_eval_task_id = task_id - eval_task = EvalTask( + if provider_benchmark_id is None: + provider_benchmark_id = task_id + benchmark = Benchmark( identifier=task_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, provider_id=provider_id, - provider_resource_id=provider_eval_task_id, + provider_resource_id=provider_benchmark_id, ) - await self.register_object(eval_task) + await self.register_object(benchmark) - async def DEPRECATED_list_eval_tasks(self) -> ListEvalTasksResponse: + async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: raise DeprecationWarning("Use /eval/tasks instead") - async def DEPRECATED_get_eval_task( + async def DEPRECATED_get_benchmark( self, - eval_task_id: str, - ) -> Optional[EvalTask]: + benchmark_id: str, + ) -> Optional[Benchmark]: raise DeprecationWarning("Use /eval/tasks instead") - async def DEPRECATED_register_eval_task( + async def DEPRECATED_register_benchmark( self, - eval_task_id: str, + benchmark_id: str, dataset_id: str, scoring_functions: List[str], - provider_eval_task_id: Optional[str] = None, + provider_benchmark_id: Optional[str] = None, provider_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, ) -> None: diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index 2baad8ac45..9335dc3a95 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -15,10 +15,10 @@ from llama_stack.apis.agents import Agents from llama_stack.apis.batch_inference import BatchInference +from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from llama_stack.apis.eval import Eval -from 
llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models @@ -53,7 +53,7 @@ class LlamaStack( PostTraining, VectorIO, Eval, - EvalTasks, + Benchmarks, Scoring, ScoringFunctions, DatasetIO, @@ -78,7 +78,7 @@ class LlamaStack( "register_scoring_function", "list_scoring_functions", ), - ("eval_tasks", Api.eval_tasks, "register_eval_task", "list_eval_tasks"), + ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"), ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"), ] diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md index c0a2597af5..8fceb5c63c 100644 --- a/llama_stack/distribution/ui/README.md +++ b/llama_stack/distribution/ui/README.md @@ -26,7 +26,7 @@ $ llama-stack-client datasets register \ ``` ```bash -$ llama-stack-client eval_tasks register \ +$ llama-stack-client benchmarks register \ --eval-task-id meta-reference-mmlu \ --provider-id meta-reference \ --dataset-id mmlu \ diff --git a/llama_stack/distribution/ui/page/distribution/eval_tasks.py b/llama_stack/distribution/ui/page/distribution/eval_tasks.py index f589696631..b83023901d 100644 --- a/llama_stack/distribution/ui/page/distribution/eval_tasks.py +++ b/llama_stack/distribution/ui/page/distribution/eval_tasks.py @@ -8,12 +8,12 @@ from modules.api import llama_stack_api -def eval_tasks(): +def benchmarks(): # Eval Tasks Section st.header("Eval Tasks") - eval_tasks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list()} + benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()} - if len(eval_tasks_info) > 0: - selected_eval_task = st.selectbox("Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect") - st.json(eval_tasks_info[selected_eval_task], expanded=True) + if len(benchmarks_info) > 0: + selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect") + st.json(benchmarks_info[selected_benchmark], expanded=True) diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py index 38d4945708..a86fda8565 100644 --- a/llama_stack/distribution/ui/page/distribution/resources.py +++ b/llama_stack/distribution/ui/page/distribution/resources.py @@ -4,8 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from page.distribution.benchmarks import benchmarks from page.distribution.datasets import datasets -from page.distribution.eval_tasks import eval_tasks from page.distribution.models import models from page.distribution.scoring_functions import scoring_functions from page.distribution.shields import shields @@ -36,7 +36,7 @@ def resources_page(): }, ) if selected_resource == "Eval Tasks": - eval_tasks() + benchmarks() elif selected_resource == "Vector Databases": vector_dbs() elif selected_resource == "Datasets": diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index c4a44990f8..e24da4eb38 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -13,28 +13,28 @@ from modules.api import llama_stack_api -def select_eval_task_1(): +def select_benchmark_1(): # Select Eval Tasks st.subheader("1. Choose An Eval Task") - eval_tasks = llama_stack_api.client.eval_tasks.list() - eval_tasks = {et.identifier: et for et in eval_tasks} - eval_tasks_names = list(eval_tasks.keys()) - selected_eval_task = st.selectbox( + benchmarks = llama_stack_api.client.benchmarks.list() + benchmarks = {et.identifier: et for et in benchmarks} + benchmarks_names = list(benchmarks.keys()) + selected_benchmark = st.selectbox( "Choose an eval task.", - options=eval_tasks_names, + options=benchmarks_names, help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.", ) with st.expander("View Eval Task"): - st.json(eval_tasks[selected_eval_task], expanded=True) + st.json(benchmarks[selected_benchmark], expanded=True) - st.session_state["selected_eval_task"] = selected_eval_task - st.session_state["eval_tasks"] = eval_tasks + st.session_state["selected_benchmark"] = selected_benchmark + st.session_state["benchmarks"] = benchmarks if st.button("Confirm", key="confirm_1"): - st.session_state["selected_eval_task_1_next"] = True + st.session_state["selected_benchmark_1_next"] = True def define_eval_candidate_2(): - if not st.session_state.get("selected_eval_task_1_next", None): + if not st.session_state.get("selected_benchmark_1_next", None): return st.subheader("2. Define Eval Candidate") @@ -163,11 +163,11 @@ def run_evaluation_3(): Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button. """ ) - selected_eval_task = st.session_state["selected_eval_task"] - eval_tasks = st.session_state["eval_tasks"] + selected_benchmark = st.session_state["selected_benchmark"] + benchmarks = st.session_state["benchmarks"] eval_candidate = st.session_state["eval_candidate"] - dataset_id = eval_tasks[selected_eval_task].dataset_id + dataset_id = benchmarks[selected_benchmark].dataset_id rows = llama_stack_api.client.datasetio.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1, @@ -182,16 +182,16 @@ def run_evaluation_3(): help="Number of examples from the dataset to evaluate. 
", ) - eval_task_config = { + benchmark_config = { "type": "benchmark", "eval_candidate": eval_candidate, "scoring_params": {}, } with st.expander("View Evaluation Task", expanded=True): - st.json(eval_tasks[selected_eval_task], expanded=True) + st.json(benchmarks[selected_benchmark], expanded=True) with st.expander("View Evaluation Task Configuration", expanded=True): - st.json(eval_task_config, expanded=True) + st.json(benchmark_config, expanded=True) # Add run button and handle evaluation if st.button("Run Evaluation"): @@ -211,10 +211,10 @@ def run_evaluation_3(): progress_bar.progress(progress, text=progress_text) # Run evaluation for current row eval_res = llama_stack_api.client.eval.evaluate_rows( - task_id=selected_eval_task, + task_id=selected_benchmark, input_rows=[r], - scoring_functions=eval_tasks[selected_eval_task].scoring_functions, - task_config=eval_task_config, + scoring_functions=benchmarks[selected_benchmark].scoring_functions, + task_config=benchmark_config, ) for k in r.keys(): @@ -227,7 +227,7 @@ def run_evaluation_3(): output_res[k] = [] output_res[k].append(eval_res.generations[0][k]) - for scoring_fn in eval_tasks[selected_eval_task].scoring_functions: + for scoring_fn in benchmarks[selected_benchmark].scoring_functions: if scoring_fn not in output_res: output_res[scoring_fn] = [] output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) @@ -247,7 +247,7 @@ def native_evaluation_page(): st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙") st.title("📊 Evaluations (Generation + Scoring)") - select_eval_task_1() + select_benchmark_1() define_eval_candidate_2() run_evaluation_3() diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index d0c448f8c6..494a46b036 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -10,10 +10,11 @@ from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field +from llama_stack.apis.benchmarks import Benchmark + from llama_stack.apis.datasets import Dataset from llama_stack.apis.datatypes import Api -from llama_stack.apis.eval_tasks import EvalTask from llama_stack.apis.models import Model from llama_stack.apis.scoring_functions import ScoringFn from llama_stack.apis.shields import Shield @@ -49,8 +50,8 @@ async def list_scoring_functions(self) -> List[ScoringFn]: ... async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ... -class EvalTasksProtocolPrivate(Protocol): - async def register_eval_task(self, eval_task: EvalTask) -> None: ... +class BenchmarksProtocolPrivate(Protocol): + async def register_benchmark(self, benchmark: Benchmark) -> None: ... 
class ToolsProtocolPrivate(Protocol): diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 1db627007a..07310f59c0 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -8,13 +8,13 @@ from tqdm import tqdm from llama_stack.apis.agents import Agents, StepType +from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets -from llama_stack.apis.eval_tasks import EvalTask from llama_stack.apis.inference import Inference, UserMessage from llama_stack.apis.scoring import Scoring from llama_stack.distribution.datatypes import Api -from llama_stack.providers.datatypes import EvalTasksProtocolPrivate +from llama_stack.providers.datatypes import BenchmarksProtocolPrivate from llama_stack.providers.inline.agents.meta_reference.agent_instance import ( MEMORY_QUERY_TOOL, @@ -27,16 +27,16 @@ from llama_stack.providers.utils.kvstore import kvstore_impl from .....apis.common.job_types import Job -from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus +from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus from .config import MetaReferenceEvalConfig -EVAL_TASKS_PREFIX = "eval_tasks:" +EVAL_TASKS_PREFIX = "benchmarks:" class MetaReferenceEvalImpl( Eval, - EvalTasksProtocolPrivate, + BenchmarksProtocolPrivate, ): def __init__( self, @@ -57,36 +57,36 @@ def __init__( # TODO: assume sync job, will need jobs API for async scheduling self.jobs = {} - self.eval_tasks = {} + self.benchmarks = {} async def initialize(self) -> None: self.kvstore = await kvstore_impl(self.config.kvstore) - # Load existing eval_tasks from kvstore + # Load existing benchmarks from kvstore start_key = EVAL_TASKS_PREFIX end_key = f"{EVAL_TASKS_PREFIX}\xff" - stored_eval_tasks = await self.kvstore.range(start_key, end_key) + stored_benchmarks = await self.kvstore.range(start_key, end_key) - for eval_task in stored_eval_tasks: - eval_task = EvalTask.model_validate_json(eval_task) - self.eval_tasks[eval_task.identifier] = eval_task + for benchmark in stored_benchmarks: + benchmark = Benchmark.model_validate_json(benchmark) + self.benchmarks[benchmark.identifier] = benchmark async def shutdown(self) -> None: ... 
- async def register_eval_task(self, task_def: EvalTask) -> None: + async def register_benchmark(self, task_def: Benchmark) -> None: # Store in kvstore key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}" await self.kvstore.set( key=key, value=task_def.model_dump_json(), ) - self.eval_tasks[task_def.identifier] = task_def + self.benchmarks[task_def.identifier] = task_def async def run_eval( self, task_id: str, - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> Job: - task_def = self.eval_tasks[task_id] + task_def = self.benchmarks[task_id] dataset_id = task_def.dataset_id candidate = task_config.eval_candidate scoring_functions = task_def.scoring_functions @@ -110,7 +110,7 @@ async def run_eval( return Job(job_id=job_id) async def _run_agent_generation( - self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig + self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig ) -> List[Dict[str, Any]]: candidate = task_config.eval_candidate create_response = await self.agents_api.create_agent(candidate.config) @@ -153,7 +153,7 @@ async def _run_agent_generation( return generations async def _run_model_generation( - self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig + self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig ) -> List[Dict[str, Any]]: candidate = task_config.eval_candidate assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided" @@ -192,7 +192,7 @@ async def evaluate_rows( task_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: candidate = task_config.eval_candidate if candidate.type == "agent": diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py index 40835bf53d..78351a28ef 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -11,8 +11,8 @@ from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType from llama_stack.apis.eval.eval import ( - AppEvalTaskConfig, - BenchmarkEvalTaskConfig, + AppBenchmarkConfig, + BenchmarkBenchmarkConfig, ModelCandidate, ) from llama_stack.apis.inference import SamplingParams @@ -30,18 +30,18 @@ class Testeval: @pytest.mark.asyncio - async def test_eval_tasks_list(self, eval_stack): + async def test_benchmarks_list(self, eval_stack): # NOTE: this needs you to ensure that you are starting from a clean state # but so far we don't have an unregister API unfortunately, so be careful - eval_tasks_impl = eval_stack[Api.eval_tasks] - response = await eval_tasks_impl.list_eval_tasks() + benchmarks_impl = eval_stack[Api.benchmarks] + response = await benchmarks_impl.list_benchmarks() assert isinstance(response, list) @pytest.mark.asyncio async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model): - eval_impl, eval_tasks_impl, datasetio_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasetio], eval_stack[Api.datasets], eval_stack[Api.models], @@ -60,8 +60,8 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model "basic::equality", ] task_id = "meta-reference::app_eval" - await eval_tasks_impl.register_eval_task( - eval_task_id=task_id, + await benchmarks_impl.register_benchmark( + 
benchmark_id=task_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) @@ -69,7 +69,7 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model task_id=task_id, input_rows=rows.rows, scoring_functions=scoring_functions, - task_config=AppEvalTaskConfig( + task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -92,9 +92,9 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model @pytest.mark.asyncio async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): - eval_impl, eval_tasks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasets], eval_stack[Api.models], ) @@ -106,14 +106,14 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): ] task_id = "meta-reference::app_eval-2" - await eval_tasks_impl.register_eval_task( - eval_task_id=task_id, + await benchmarks_impl.register_benchmark( + benchmark_id=task_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.run_eval( task_id=task_id, - task_config=AppEvalTaskConfig( + task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -131,9 +131,9 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): @pytest.mark.asyncio async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): - eval_impl, eval_tasks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasets], eval_stack[Api.models], ) @@ -159,20 +159,20 @@ async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): ) # register eval task - await eval_tasks_impl.register_eval_task( - eval_task_id="meta-reference-mmlu", + await benchmarks_impl.register_benchmark( + benchmark_id="meta-reference-mmlu", dataset_id="mmlu", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) # list benchmarks - response = await eval_tasks_impl.list_eval_tasks() + response = await benchmarks_impl.list_benchmarks() assert len(response) > 0 benchmark_id = "meta-reference-mmlu" response = await eval_impl.run_eval( task_id=benchmark_id, - task_config=BenchmarkEvalTaskConfig( + task_config=BenchmarkBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), diff --git a/llama_stack/providers/tests/resolver.py b/llama_stack/providers/tests/resolver.py index 0ff6327170..092514079a 100644 --- a/llama_stack/providers/tests/resolver.py +++ b/llama_stack/providers/tests/resolver.py @@ -10,8 +10,9 @@ from pydantic import BaseModel +from llama_stack.apis.benchmarks import BenchmarkInput + from llama_stack.apis.datasets import DatasetInput -from llama_stack.apis.eval_tasks import EvalTaskInput from llama_stack.apis.models import ModelInput from llama_stack.apis.scoring_functions import ScoringFnInput from llama_stack.apis.shields import ShieldInput @@ -42,7 +43,7 @@ async def construct_stack_for_test( vector_dbs: Optional[List[VectorDBInput]] = None, datasets: Optional[List[DatasetInput]] = None, scoring_fns: Optional[List[ScoringFnInput]] = None, - eval_tasks: Optional[List[EvalTaskInput]] = None, + benchmarks: 
Optional[List[BenchmarkInput]] = None, tool_groups: Optional[List[ToolGroupInput]] = None, ) -> TestStack: sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db") @@ -56,7 +57,7 @@ async def construct_stack_for_test( vector_dbs=vector_dbs or [], datasets=datasets or [], scoring_fns=scoring_fns or [], - eval_tasks=eval_tasks or [], + benchmarks=benchmarks or [], tool_groups=tool_groups or [], ) run_config = parse_and_maybe_upgrade_config(run_config) diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 39408c1bd7..81d8997163 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 5a70890a89..71003d5b0c 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index bdc82d03a9..493beeb0d7 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -108,7 +108,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: brave-search diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 2ba62a7821..cb045c714b 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -99,7 +99,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: brave-search diff --git a/llama_stack/templates/experimental-post-training/run.yaml b/llama_stack/templates/experimental-post-training/run.yaml index 75d103c9fa..e70ccdd2de 100644 --- a/llama_stack/templates/experimental-post-training/run.yaml +++ b/llama_stack/templates/experimental-post-training/run.yaml @@ -85,4 +85,4 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index a4b425436f..3bad366d1c 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -164,7 +164,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index a497317bde..e60fdecb2f 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -153,7 +153,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index 0329f580ba..0dfa9fade3 100644 --- 
a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -116,7 +116,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index 8163fe28e6..fdb19b63f9 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -106,7 +106,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 9cee920a5b..541d7c8645 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -116,7 +116,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index c8ad0d38da..301c3c1124 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -106,7 +106,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index 0faaabb159..7eb704e3f9 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 6ffe1fa360..92bdbabad0 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index 5ff87a9010..9fb506cbb2 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 6dc325e9dd..6702bf6ea5 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -139,7 +139,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 
5b5c9c253a..bc5fe4ce9b 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -113,7 +113,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 3cc1cb2ac6..eff648f039 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -102,7 +102,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 4a0fa9a857..97a4701aa2 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 9631f94a2f..1456f30880 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index 6cec51824c..9d29ff0c9b 100644 --- a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index 503505c326..322bc455ea 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -106,7 +106,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index f1953c513e..3cff45e4e1 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -105,7 +105,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index ec351108e5..39daf64810 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -159,7 +159,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index c2afd98e9b..effd7b9cd9 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -148,7 +148,7 @@ shields: vector_dbs: [] datasets: 
[] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index 165e4d51db..0cc03c7eef 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search From b20742fce742cf19b8293e12bff541adbb03047d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:31:42 -0800 Subject: [PATCH 08/31] replace --- llama_stack/distribution/ui/page/distribution/eval_tasks.py | 4 ++-- llama_stack/distribution/ui/page/distribution/resources.py | 4 ++-- llama_stack/distribution/ui/page/evaluations/native_eval.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llama_stack/distribution/ui/page/distribution/eval_tasks.py b/llama_stack/distribution/ui/page/distribution/eval_tasks.py index b83023901d..1428ae9ab2 100644 --- a/llama_stack/distribution/ui/page/distribution/eval_tasks.py +++ b/llama_stack/distribution/ui/page/distribution/eval_tasks.py @@ -9,8 +9,8 @@ def benchmarks(): - # Eval Tasks Section - st.header("Eval Tasks") + # Benchmarks Section + st.header("Benchmarks") benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()} diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py index a86fda8565..c0d4aac68e 100644 --- a/llama_stack/distribution/ui/page/distribution/resources.py +++ b/llama_stack/distribution/ui/page/distribution/resources.py @@ -21,7 +21,7 @@ def resources_page(): "Shields", "Scoring Functions", "Datasets", - "Eval Tasks", + "Benchmarks", ] icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"] selected_resource = option_menu( @@ -35,7 +35,7 @@ def resources_page(): }, }, ) - if selected_resource == "Eval Tasks": + if selected_resource == "Benchmarks": benchmarks() elif selected_resource == "Vector Databases": vector_dbs() diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index e24da4eb38..39385dd140 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -14,7 +14,7 @@ def select_benchmark_1(): - # Select Eval Tasks + # Select Benchmarks st.subheader("1. 
Choose An Eval Task") benchmarks = llama_stack_api.client.benchmarks.list() benchmarks = {et.identifier: et for et in benchmarks} From 017d24fe6561005b2debdddb5935e4c475629862 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:34:32 -0800 Subject: [PATCH 09/31] replace task_id -> benchmark_id --- docs/_static/llama-stack-spec.html | 26 +++--- docs/_static/llama-stack-spec.yaml | 26 +++--- .../Llama_Stack_Benchmark_Evals.ipynb | 6 +- docs/source/building_applications/evals.md | 6 +- .../building_applications/evaluation.md | 4 +- .../references/evals_reference/index.md | 6 +- .../references/python_sdk_reference/index.md | 10 +-- llama_stack/apis/eval/eval.py | 20 ++--- llama_stack/apis/eval_tasks/__init__.py | 7 -- llama_stack/apis/eval_tasks/eval_tasks.py | 86 ------------------- llama_stack/cli/download.py | 14 +-- llama_stack/cli/verify_download.py | 4 +- llama_stack/distribution/routers/routers.py | 28 +++--- .../distribution/routers/routing_tables.py | 10 +-- .../ui/page/evaluations/native_eval.py | 2 +- .../inline/eval/meta_reference/eval.py | 16 ++-- 16 files changed, 89 insertions(+), 182 deletions(-) delete mode 100644 llama_stack/apis/eval_tasks/__init__.py delete mode 100644 llama_stack/apis/eval_tasks/eval_tasks.py diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 84c6fd99df..c656808a67 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -613,7 +613,7 @@ } } }, - "/v1/eval/tasks/{task_id}/evaluations": { + "/v1/eval/tasks/{benchmark_id}/evaluations": { "post": { "responses": { "200": { @@ -633,7 +633,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -811,7 +811,7 @@ ] } }, - "/v1/eval/tasks/{task_id}": { + "/v1/eval/tasks/{benchmark_id}": { "get": { "responses": { "200": { @@ -838,7 +838,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1431,7 +1431,7 @@ } } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1458,7 +1458,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1487,7 +1487,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1505,7 +1505,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1533,7 +1533,7 @@ } }, { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -2204,7 +2204,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs": { + "/v1/eval/tasks/{benchmark_id}/jobs": { "post": { "responses": { "200": { @@ -2224,7 +2224,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -7361,7 +7361,7 @@ "RegisterBenchmarkRequest": { "type": "object", "properties": { - "task_id": { + "benchmark_id": { "type": "string" }, "dataset_id": { @@ -7407,7 +7407,7 @@ }, "additionalProperties": false, "required": [ - "task_id", + "benchmark_id", "dataset_id", "scoring_functions" ] diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index dd0951fdec..0f0a613a81 100644 --- a/docs/_static/llama-stack-spec.yaml +++ 
b/docs/_static/llama-stack-spec.yaml @@ -372,7 +372,7 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/tasks/{task_id}/evaluations: + /v1/eval/tasks/{benchmark_id}/evaluations: post: responses: '200': @@ -385,7 +385,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -490,7 +490,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}: + /v1/eval/tasks/{benchmark_id}: get: responses: '200': @@ -505,7 +505,7 @@ paths: - Benchmarks description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -852,7 +852,7 @@ paths: schema: $ref: '#/components/schemas/InvokeToolRequest' required: true - /v1/eval/tasks/{task_id}/jobs/{job_id}: + /v1/eval/tasks/{benchmark_id}/jobs/{job_id}: get: responses: '200': @@ -867,7 +867,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -885,7 +885,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -895,7 +895,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result: get: responses: '200': @@ -913,7 +913,7 @@ paths: required: true schema: type: string - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -1328,7 +1328,7 @@ paths: type: array items: type: string - /v1/eval/tasks/{task_id}/jobs: + /v1/eval/tasks/{benchmark_id}/jobs: post: responses: '200': @@ -1341,7 +1341,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -4678,7 +4678,7 @@ components: RegisterBenchmarkRequest: type: object properties: - task_id: + benchmark_id: type: string dataset_id: type: string @@ -4702,7 +4702,7 @@ components: - type: object additionalProperties: false required: - - task_id + - benchmark_id - dataset_id - scoring_functions RegisterModelRequest: diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 6e8480f945..599df201a0 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1024,7 +1024,7 @@ ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::mmmu\",\n", + " benchmark_id=\"meta-reference::mmmu\",\n", " input_rows=eval_rows,\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", " task_config={\n", @@ -1203,7 +1203,7 @@ ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::simpleqa\",\n", + " benchmark_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", @@ -1352,7 +1352,7 @@ "}\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::simpleqa\",\n", + " benchmark_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md index c1c371ca80..f28e0d5fd7 100644 --- a/docs/source/building_applications/evals.md +++ b/docs/source/building_applications/evals.md @@ -48,7 +48,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - 
task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -106,7 +106,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -156,7 +156,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ diff --git a/docs/source/building_applications/evaluation.md b/docs/source/building_applications/evaluation.md index df18c146cc..ad220f7518 100644 --- a/docs/source/building_applications/evaluation.md +++ b/docs/source/building_applications/evaluation.md @@ -18,7 +18,7 @@ response = client.benchmarks.register( # Run evaluation job = client.eval.run_eval( - task_id="my_eval", + benchmark_id="my_eval", task_config={ "type": "app", "eval_candidate": {"type": "agent", "config": agent_config}, @@ -26,5 +26,5 @@ job = client.eval.run_eval( ) # Get results -result = client.eval.job_result(task_id="my_eval", job_id=job.job_id) +result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id) ``` diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md index f0275511df..71dbb47e59 100644 --- a/docs/source/references/evals_reference/index.md +++ b/docs/source/references/evals_reference/index.md @@ -84,7 +84,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -142,7 +142,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -192,7 +192,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md index eca8c58f54..9d1130422f 100644 --- a/docs/source/references/python_sdk_reference/index.md +++ b/docs/source/references/python_sdk_reference/index.md @@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job Methods: -- client.eval.evaluate_rows(task_id, \*\*params) -> EvaluateResponse -- client.eval.run_eval(task_id, \*\*params) -> Job +- client.eval.evaluate_rows(benchmark_id, \*\*params) -> EvaluateResponse +- client.eval.run_eval(benchmark_id, \*\*params) -> Job ### Jobs @@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse Methods: -- client.eval.jobs.retrieve(job_id, \*, task_id) -> EvaluateResponse -- client.eval.jobs.cancel(job_id, \*, task_id) -> None -- client.eval.jobs.status(job_id, \*, task_id) -> Optional[JobStatusResponse] +- client.eval.jobs.retrieve(job_id, \*, benchmark_id) -> EvaluateResponse +- client.eval.jobs.cancel(job_id, \*, benchmark_id) -> None +- client.eval.jobs.status(job_id, \*, benchmark_id) -> 
Optional[JobStatusResponse] ## Inspect diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 16b96d618b..273ef657c1 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -76,27 +76,27 @@ class EvaluateResponse(BaseModel): class Eval(Protocol): - @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + @webmethod(route="/eval/tasks/{benchmark_id}/jobs", method="POST") async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: BenchmarkConfig, ) -> Job: ... - @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") + @webmethod(route="/eval/tasks/{benchmark_id}/evaluations", method="POST") async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, ) -> EvaluateResponse: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="GET") + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") - async def job_cancel(self, task_id: str, job_id: str) -> None: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="DELETE") + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}/result", method="GET") + async def job_result(self, job_id: str, benchmark_id: str) -> EvaluateResponse: ... diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/eval_tasks/__init__.py deleted file mode 100644 index f8f5649570..0000000000 --- a/llama_stack/apis/eval_tasks/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .benchmarks import * # noqa: F401 F403 diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py deleted file mode 100644 index 7c8ed8dc04..0000000000 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable - -from llama_models.schema_utils import json_schema_type, webmethod -from pydantic import BaseModel, Field - -from llama_stack.apis.resource import Resource, ResourceType - - -class CommonBenchmarkFields(BaseModel): - dataset_id: str - scoring_functions: List[str] - metadata: Dict[str, Any] = Field( - default_factory=dict, - description="Metadata for this evaluation task", - ) - - -@json_schema_type -class Benchmark(CommonBenchmarkFields, Resource): - type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value - - @property - def task_id(self) -> str: - return self.identifier - - @property - def provider_benchmark_id(self) -> str: - return self.provider_resource_id - - -class BenchmarkInput(CommonBenchmarkFields, BaseModel): - task_id: str - provider_id: Optional[str] = None - provider_benchmark_id: Optional[str] = None - - -class ListBenchmarksResponse(BaseModel): - data: List[Benchmark] - - -@runtime_checkable -class Benchmarks(Protocol): - @webmethod(route="/eval/tasks", method="GET") - async def list_benchmarks(self) -> ListBenchmarksResponse: ... - - @webmethod(route="/eval/tasks/{task_id}", method="GET") - async def get_benchmark( - self, - task_id: str, - ) -> Optional[Benchmark]: ... - - @webmethod(route="/eval/tasks", method="POST") - async def register_benchmark( - self, - task_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_benchmark_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... - - @webmethod(route="/eval-tasks", method="GET") - async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: ... - - @webmethod(route="/eval-tasks/{benchmark_id}", method="GET") - async def DEPRECATED_get_benchmark( - self, - benchmark_id: str, - ) -> Optional[Benchmark]: ... - - @webmethod(route="/eval-tasks", method="POST") - async def DEPRECATED_register_benchmark( - self, - benchmark_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_benchmark_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... 
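For orientation, a sketch of the Eval job flow from the client after this patch's `task_id` -> `benchmark_id` rename, not part of the patch. It assumes the benchmark registered in the earlier sketch and a served inference model; the `run_eval` and `jobs.retrieve` signatures follow the SDK reference updated above, and fetching the result immediately relies on the meta-reference provider running jobs synchronously (see the TODO in its eval.py earlier in this series):

```python
# Sketch only: POST /v1/eval/tasks/{benchmark_id}/jobs, then fetch the result.
# Model id and max_tokens are illustrative; the meta-reference provider asserts
# that sampling_params.max_tokens is set for model candidates.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")

job = client.eval.run_eval(
    benchmark_id="meta-reference-mmlu",
    task_config={
        "type": "benchmark",
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "sampling_params": {"max_tokens": 4096},
        },
    },
)

# GET /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result
result = client.eval.jobs.retrieve(job_id=job.job_id, benchmark_id="meta-reference-mmlu")
print(result.scores)
```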
diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index 379ac49caa..7b9b303f48 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -105,7 +105,7 @@ class DownloadTask: output_file: str total_size: int = 0 downloaded_size: int = 0 - task_id: Optional[int] = None + benchmark_id: Optional[int] = None retries: int = 0 max_retries: int = 3 @@ -183,8 +183,8 @@ async def _get_info(): ) # Update the progress bar's total size once we know it - if task.task_id is not None: - self.progress.update(task.task_id, total=task.total_size) + if task.benchmark_id is not None: + self.progress.update(task.benchmark_id, total=task.total_size) except httpx.HTTPError as e: self.console.print(f"[red]Error getting file info: {str(e)}[/red]") @@ -207,7 +207,7 @@ async def _download_chunk(): file.write(chunk) task.downloaded_size += len(chunk) self.progress.update( - task.task_id, + task.benchmark_id, completed=task.downloaded_size, ) @@ -234,7 +234,7 @@ async def download_file(self, task: DownloadTask) -> None: if os.path.exists(task.output_file): if self.verify_file_integrity(task): self.console.print(f"[green]Already downloaded {task.output_file}[/green]") - self.progress.update(task.task_id, completed=task.total_size) + self.progress.update(task.benchmark_id, completed=task.total_size) return await self.prepare_download(task) @@ -258,7 +258,7 @@ async def download_file(self, task: DownloadTask) -> None: raise DownloadError(f"Download failed: {str(e)}") from e except Exception as e: - self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]") + self.progress.update(task.benchmark_id, description=f"[red]Failed: {task.output_file}[/red]") raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e def has_disk_space(self, tasks: List[DownloadTask]) -> bool: @@ -293,7 +293,7 @@ async def download_all(self, tasks: List[DownloadTask]) -> None: with self.progress: for task in tasks: desc = f"Downloading {Path(task.output_file).name}" - task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) + task.benchmark_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) semaphore = asyncio.Semaphore(self.max_concurrent_downloads) diff --git a/llama_stack/cli/verify_download.py b/llama_stack/cli/verify_download.py index 47993c3613..ca72ca5818 100644 --- a/llama_stack/cli/verify_download.py +++ b/llama_stack/cli/verify_download.py @@ -82,7 +82,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) as progress: for filepath, expected_hash in checksums.items(): full_path = model_dir / filepath - task_id = progress.add_task(f"Verifying {filepath}...", total=None) + benchmark_id = progress.add_task(f"Verifying {filepath}...", total=None) exists = full_path.exists() actual_hash = None @@ -102,7 +102,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) ) - progress.remove_task(task_id) + progress.remove_task(benchmark_id) return results diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 6697b03e26..f9f3067670 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -347,23 +347,23 @@ async def shutdown(self) -> None: async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: AppBenchmarkConfig, ) -> Job: - return await self.routing_table.get_provider_impl(task_id).run_eval( 
- task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).run_eval( + benchmark_id=benchmark_id, task_config=task_config, ) async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).evaluate_rows( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows( + benchmark_id=benchmark_id, input_rows=input_rows, scoring_functions=scoring_functions, task_config=task_config, @@ -371,28 +371,28 @@ async def evaluate_rows( async def job_status( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> Optional[JobStatus]: - return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id) + return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id) async def job_cancel( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> None: - await self.routing_table.get_provider_impl(task_id).job_cancel( - task_id, + await self.routing_table.get_provider_impl(benchmark_id).job_cancel( + benchmark_id, job_id, ) async def job_result( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).job_result( - task_id, + return await self.routing_table.get_provider_impl(benchmark_id).job_result( + benchmark_id, job_id, ) diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 6c1b06ed6b..a52ab7fbdf 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -433,12 +433,12 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): async def list_benchmarks(self) -> ListBenchmarksResponse: return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark")) - async def get_benchmark(self, task_id: str) -> Optional[Benchmark]: - return await self.get_object_by_identifier("benchmark", task_id) + async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]: + return await self.get_object_by_identifier("benchmark", benchmark_id) async def register_benchmark( self, - task_id: str, + benchmark_id: str, dataset_id: str, scoring_functions: List[str], metadata: Optional[Dict[str, Any]] = None, @@ -455,9 +455,9 @@ async def register_benchmark( "No provider specified and multiple providers available. Please specify a provider_id." 
) if provider_benchmark_id is None: - provider_benchmark_id = task_id + provider_benchmark_id = benchmark_id benchmark = Benchmark( - identifier=task_id, + identifier=benchmark_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index 39385dd140..753c574a28 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -211,7 +211,7 @@ def run_evaluation_3(): progress_bar.progress(progress, text=progress_text) # Run evaluation for current row eval_res = llama_stack_api.client.eval.evaluate_rows( - task_id=selected_benchmark, + benchmark_id=selected_benchmark, input_rows=[r], scoring_functions=benchmarks[selected_benchmark].scoring_functions, task_config=benchmark_config, diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 07310f59c0..a02418e741 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -83,10 +83,10 @@ async def register_benchmark(self, task_def: Benchmark) -> None: async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: BenchmarkConfig, ) -> Job: - task_def = self.benchmarks[task_id] + task_def = self.benchmarks[benchmark_id] dataset_id = task_def.dataset_id candidate = task_config.eval_candidate scoring_functions = task_def.scoring_functions @@ -97,7 +97,7 @@ async def run_eval( rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples), ) res = await self.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=all_rows.rows, scoring_functions=scoring_functions, task_config=task_config, @@ -189,7 +189,7 @@ async def _run_model_generation( async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, @@ -219,17 +219,17 @@ async def evaluate_rows( return EvaluateResponse(generations=generations, scores=score_response.results) - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: if job_id in self.jobs: return JobStatus.completed return None - async def job_cancel(self, task_id: str, job_id: str) -> None: + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: raise NotImplementedError("Job cancel is not implemented yet") - async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse: - status = await self.job_status(task_id, job_id) + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: + status = await self.job_status(benchmark_id, job_id) if not status or status != JobStatus.completed: raise ValueError(f"Job is not completed, Status: {status.value}") From 8759196e29ccd3650da830e18c182e1ac1f2b37d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:38:21 -0800 Subject: [PATCH 10/31] benchmark config --- llama_stack/apis/eval/eval.py | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 273ef657c1..90b14131f4 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -38,18 +38,7 @@ class 
AgentCandidate(BaseModel): @json_schema_type -class BenchmarkBenchmarkConfig(BaseModel): - type: Literal["benchmark"] = "benchmark" - eval_candidate: EvalCandidate - num_examples: Optional[int] = Field( - description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated", - default=None, - ) - - -@json_schema_type -class AppBenchmarkConfig(BaseModel): - type: Literal["app"] = "app" +class BenchmarkConfig(BaseModel): eval_candidate: EvalCandidate scoring_params: Dict[str, ScoringFnParams] = Field( description="Map between scoring function id and parameters for each scoring function you want to run", @@ -62,12 +51,6 @@ class AppBenchmarkConfig(BaseModel): # we could optinally add any specific dataset config here -BenchmarkConfig = register_schema( - Annotated[Union[BenchmarkBenchmarkConfig, AppBenchmarkConfig], Field(discriminator="type")], - name="BenchmarkConfig", -) - - @json_schema_type class EvaluateResponse(BaseModel): generations: List[Dict[str, Any]] @@ -76,14 +59,14 @@ class EvaluateResponse(BaseModel): class Eval(Protocol): - @webmethod(route="/eval/tasks/{benchmark_id}/jobs", method="POST") + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST") async def run_eval( self, benchmark_id: str, task_config: BenchmarkConfig, ) -> Job: ... - @webmethod(route="/eval/tasks/{benchmark_id}/evaluations", method="POST") + @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") async def evaluate_rows( self, benchmark_id: str, @@ -92,11 +75,11 @@ async def evaluate_rows( task_config: BenchmarkConfig, ) -> EvaluateResponse: ... - @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="GET") + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... - @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="DELETE") + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE") async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... - @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, job_id: str, benchmark_id: str) -> EvaluateResponse: ... + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... From e07776fff618b4db2f12ff008b4f3cd51feba8c2 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:42:01 -0800 Subject: [PATCH 11/31] update --- .../distribution/routers/routing_tables.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index a52ab7fbdf..5d2da73372 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -4,6 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import logging
 from typing import Any, Dict, List, Optional
 
 from pydantic import TypeAdapter
@@ -39,6 +40,8 @@
 from llama_stack.distribution.store import DistributionRegistry
 from llama_stack.providers.datatypes import Api, RoutingTable
 
+logger = logging.getLogger(__name__)
+
 
 def get_impl_api(p: Any) -> Api:
     return p.__provider_spec__.api
@@ -466,16 +469,18 @@ async def register_benchmark(
         )
         await self.register_object(benchmark)
 
-    async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse:
+    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
+        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
         raise DeprecationWarning("Use /eval/tasks instead")
 
-    async def DEPRECATED_get_benchmark(
+    async def DEPRECATED_get_eval_task(
         self,
         benchmark_id: str,
     ) -> Optional[Benchmark]:
+        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
         raise DeprecationWarning("Use /eval/tasks instead")
 
-    async def DEPRECATED_register_benchmark(
+    async def DEPRECATED_register_eval_task(
         self,
         benchmark_id: str,
         dataset_id: str,
@@ -484,7 +489,14 @@ async def register_benchmark(
         provider_id: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
     ) -> None:
-        raise DeprecationWarning("Use /eval/tasks instead")
+        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
+        return await self.register_benchmark(
+            benchmark_id=benchmark_id,
+            dataset_id=dataset_id,
+            scoring_functions=scoring_functions,
+            metadata=metadata,
+            provider_benchmark_id=provider_benchmark_id,
+        )
 
 
 class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):

From ec721b3867d664a486faebb6a2a2b7a77ecd0b71 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Wed, 12 Feb 2025 20:48:05 -0800
Subject: [PATCH 12/31] update: move eval routes to /eval/benchmarks, keep
 deprecated /eval/tasks shims

---
 docs/_static/llama-stack-spec.html          | 469 ++++++++++++++------
 docs/_static/llama-stack-spec.yaml          | 310 +++++++++----
 llama_stack/apis/eval/eval.py               |  25 ++
 llama_stack/distribution/routers/routers.py |   3 +-
 4 files changed, 584 insertions(+), 223 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index c656808a67..652dae562c 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -67,8 +67,8 @@
             "description": "",
             "parameters": [
                 {
-                    "name": "benchmark_id",
-                    "in": "path",
+                    "name": "task_id",
+                    "in": "query",
                     "required": true,
                     "schema": {
                         "type": "string"
@@ -114,7 +114,7 @@
                 "content": {
                     "application/json": {
                         "schema": {
-                            "$ref": "#/components/schemas/DeprecatedRegisterBenchmarkRequest"
+                            "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest"
                         }
                     }
                 },
@@ -613,7 +613,7 @@
                 }
             }
         },
-        "/v1/eval/tasks/{benchmark_id}/evaluations": {
+        "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
             "post": {
                 "responses": {
                     "200": {
@@ -653,6 +653,47 @@
                 }
             }
         },
+        "/v1/eval/tasks/{task_id}/evaluations": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluateResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/EvaluateRowsDeprecatedRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                },
+                "deprecated": true
+            }
+        },
         "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": {
             "get": {
                 "responses": {
@@ -753,7 +794,7 @@
                 ]
             }
         },
-        "/v1/datasets/{dataset_id}": {
+        
"/v1/eval/benchmarks/{benchmark_id}": { "get": { "responses": { "200": { @@ -763,7 +804,7 @@ "schema": { "oneOf": [ { - "$ref": "#/components/schemas/Dataset" + "$ref": "#/components/schemas/Benchmark" }, { "type": "null" @@ -775,12 +816,12 @@ } }, "tags": [ - "Datasets" + "Benchmarks" ], "description": "", "parameters": [ { - "name": "dataset_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -788,11 +829,27 @@ } } ] - }, - "delete": { + } + }, + "/v1/datasets/{dataset_id}": { + "get": { "responses": { "200": { - "description": "OK" + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/Dataset" + }, + { + "type": "null" + } + ] + } + } + } } }, "tags": [ @@ -809,36 +866,20 @@ } } ] - } - }, - "/v1/eval/tasks/{benchmark_id}": { - "get": { + }, + "delete": { "responses": { "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Benchmark" - }, - { - "type": "null" - } - ] - } - } - } + "description": "OK" } }, "tags": [ - "Benchmarks" + "Datasets" ], "description": "", "parameters": [ { - "name": "benchmark_id", + "name": "dataset_id", "in": "path", "required": true, "schema": { @@ -1431,7 +1472,7 @@ } } }, - "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": { + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1505,7 +1546,7 @@ ] } }, - "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": { + "/v1/eval/tasks/{task_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1513,7 +1554,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateResponse" + "oneOf": [ + { + "$ref": "#/components/schemas/JobStatus" + }, + { + "type": "null" + } + ] } } } @@ -1524,6 +1572,14 @@ ], "description": "", "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, { "name": "job_id", "in": "path", @@ -1531,7 +1587,60 @@ "schema": { "type": "string" } + } + ], + "deprecated": true + }, + "delete": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ { "name": "benchmark_id", "in": "path", @@ -1539,11 +1648,19 @@ "schema": { "type": "string" } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } } ] } }, - "/v1/datasets": { + "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1551,14 +1668,53 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListDatasetsResponse" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "Datasets" + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + 
"type": "string" + } + } + ], + "deprecated": true + } + }, + "/v1/eval/benchmarks": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListBenchmarksResponse" + } + } + } + } + }, + "tags": [ + "Benchmarks" ], "description": "", "parameters": [] @@ -1570,7 +1726,7 @@ } }, "tags": [ - "Datasets" + "Benchmarks" ], "description": "", "parameters": [], @@ -1578,7 +1734,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RegisterDatasetRequest" + "$ref": "#/components/schemas/RegisterBenchmarkRequest" } } }, @@ -1586,7 +1742,7 @@ } } }, - "/v1/eval/tasks": { + "/v1/datasets": { "get": { "responses": { "200": { @@ -1594,14 +1750,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListBenchmarksResponse" + "$ref": "#/components/schemas/ListDatasetsResponse" } } } } }, "tags": [ - "Benchmarks" + "Datasets" ], "description": "", "parameters": [] @@ -1613,7 +1769,7 @@ } }, "tags": [ - "Benchmarks" + "Datasets" ], "description": "", "parameters": [], @@ -1621,7 +1777,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RegisterBenchmarkRequest" + "$ref": "#/components/schemas/RegisterDatasetRequest" } } }, @@ -2204,7 +2360,7 @@ ] } }, - "/v1/eval/tasks/{benchmark_id}/jobs": { + "/v1/eval/benchmarks/{benchmark_id}/jobs": { "post": { "responses": { "200": { @@ -2244,6 +2400,47 @@ } } }, + "/v1/eval/tasks/{task_id}/jobs": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Job" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RunEvalDeprecatedRequest" + } + } + }, + "required": true + }, + "deprecated": true + } + }, "/v1/safety/run-shield": { "post": { "responses": { @@ -2526,10 +2723,10 @@ "data" ] }, - "DeprecatedRegisterBenchmarkRequest": { + "DeprecatedRegisterEvalTaskRequest": { "type": "object", "properties": { - "benchmark_id": { + "task_id": { "type": "string" }, "dataset_id": { @@ -2575,7 +2772,7 @@ }, "additionalProperties": false, "required": [ - "benchmark_id", + "task_id", "dataset_id", "scoring_functions" ] @@ -4745,34 +4942,6 @@ "accuracy" ] }, - "AppBenchmarkConfig": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "app", - "default": "app" - }, - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate" - }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" - } - }, - "num_examples": { - "type": "integer" - } - }, - "additionalProperties": false, - "required": [ - "type", - "eval_candidate", - "scoring_params" - ] - }, "BasicScoringFnParams": { "type": "object", "properties": { @@ -4793,25 +4962,26 @@ "type" ] }, - "BenchmarkBenchmarkConfig": { + "BenchmarkConfig": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "benchmark", - "default": "benchmark" - }, "eval_candidate": { "$ref": "#/components/schemas/EvalCandidate" }, + "scoring_params": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringFnParams" + } + }, "num_examples": { "type": "integer" } }, "additionalProperties": 
false, "required": [ - "type", - "eval_candidate" + "eval_candidate", + "scoring_params" ] }, "EvalCandidate": { @@ -4831,23 +5001,6 @@ } } }, - "BenchmarkConfig": { - "oneOf": [ - { - "$ref": "#/components/schemas/BenchmarkBenchmarkConfig" - }, - { - "$ref": "#/components/schemas/AppBenchmarkConfig" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "benchmark": "#/components/schemas/BenchmarkBenchmarkConfig", - "app": "#/components/schemas/AppBenchmarkConfig" - } - } - }, "LLMAsJudgeScoringFnParams": { "type": "object", "properties": { @@ -5108,6 +5261,54 @@ "aggregated_results" ] }, + "EvaluateRowsDeprecatedRequest": { + "type": "object", + "properties": { + "input_rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "input_rows", + "scoring_functions", + "task_config" + ] + }, "Session": { "type": "object", "properties": { @@ -7304,22 +7505,22 @@ "data" ] }, - "RegisterDatasetRequest": { + "RegisterBenchmarkRequest": { "type": "object", "properties": { + "benchmark_id": { + "type": "string" + }, "dataset_id": { "type": "string" }, - "dataset_schema": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ParamType" + "scoring_functions": { + "type": "array", + "items": { + "type": "string" } }, - "url": { - "$ref": "#/components/schemas/URL" - }, - "provider_dataset_id": { + "provider_benchmark_id": { "type": "string" }, "provider_id": { @@ -7353,27 +7554,27 @@ }, "additionalProperties": false, "required": [ + "benchmark_id", "dataset_id", - "dataset_schema", - "url" + "scoring_functions" ] }, - "RegisterBenchmarkRequest": { + "RegisterDatasetRequest": { "type": "object", "properties": { - "benchmark_id": { - "type": "string" - }, "dataset_id": { "type": "string" }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" + "dataset_schema": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ParamType" } }, - "provider_benchmark_id": { + "url": { + "$ref": "#/components/schemas/URL" + }, + "provider_dataset_id": { "type": "string" }, "provider_id": { @@ -7407,9 +7608,9 @@ }, "additionalProperties": false, "required": [ - "benchmark_id", "dataset_id", - "scoring_functions" + "dataset_schema", + "url" ] }, "RegisterModelRequest": { @@ -7623,6 +7824,18 @@ "job_id" ] }, + "RunEvalDeprecatedRequest": { + "type": "object", + "properties": { + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "task_config" + ] + }, "RunShieldRequest": { "type": "object", "properties": { @@ -8105,6 +8318,9 @@ { "name": "BatchInference (Coming Soon)" }, + { + "name": "Benchmarks" + }, { "name": "DatasetIO" }, @@ -8114,9 +8330,6 @@ { "name": "Eval" }, - { - "name": "Benchmarks" - }, { "name": "Inference", "description": "This API provides the raw interface to the underlying models. 
Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", @@ -8168,10 +8381,10 @@ "tags": [ "Agents", "BatchInference (Coming Soon)", + "Benchmarks", "DatasetIO", "Datasets", "Eval", - "Benchmarks", "Inference", "Inspect", "Models", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 0f0a613a81..89e0669177 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -25,8 +25,8 @@ paths: - Benchmarks description: '' parameters: - - name: benchmark_id - in: path + - name: task_id + in: query required: true schema: type: string @@ -57,7 +57,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/DeprecatedRegisterBenchmarkRequest' + $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' required: true deprecated: true /v1/datasetio/rows: @@ -372,7 +372,7 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/tasks/{benchmark_id}/evaluations: + /v1/eval/benchmarks/{benchmark_id}/evaluations: post: responses: '200': @@ -396,6 +396,31 @@ paths: schema: $ref: '#/components/schemas/EvaluateRowsRequest' required: true + /v1/eval/tasks/{task_id}/evaluations: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateRowsDeprecatedRequest' + required: true + deprecated: true /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}: get: responses: @@ -457,7 +482,7 @@ paths: required: true schema: type: string - /v1/datasets/{dataset_id}: + /v1/eval/benchmarks/{benchmark_id}: get: responses: '200': @@ -466,21 +491,28 @@ paths: application/json: schema: oneOf: - - $ref: '#/components/schemas/Dataset' + - $ref: '#/components/schemas/Benchmark' - type: 'null' tags: - - Datasets + - Benchmarks description: '' parameters: - - name: dataset_id + - name: benchmark_id in: path required: true schema: type: string - delete: + /v1/datasets/{dataset_id}: + get: responses: '200': description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/Dataset' + - type: 'null' tags: - Datasets description: '' @@ -490,22 +522,15 @@ paths: required: true schema: type: string - /v1/eval/tasks/{benchmark_id}: - get: + delete: responses: '200': description: OK - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/Benchmark' - - type: 'null' tags: - - Benchmarks + - Datasets description: '' parameters: - - name: benchmark_id + - name: dataset_id in: path required: true schema: @@ -852,7 +877,7 @@ paths: schema: $ref: '#/components/schemas/InvokeToolRequest' required: true - /v1/eval/tasks/{benchmark_id}/jobs/{job_id}: + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}: get: responses: '200': @@ -895,7 +920,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result: + /v1/eval/tasks/{task_id}/jobs/{job_id}: get: responses: '200': @@ -903,22 +928,67 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluateResponse' + oneOf: + - $ref: '#/components/schemas/JobStatus' + - type: 'null' tags: - Eval description: '' 
parameters: + - name: task_id + in: path + required: true + schema: + type: string - name: job_id in: path required: true schema: type: string + deprecated: true + delete: + responses: + '200': + description: OK + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: - name: benchmark_id in: path required: true schema: type: string - /v1/datasets: + - name: job_id + in: path + required: true + schema: + type: string + /v1/eval/tasks/{task_id}/jobs/{job_id}/result: get: responses: '200': @@ -926,9 +996,33 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ListDatasetsResponse' + $ref: '#/components/schemas/EvaluateResponse' tags: - - Datasets + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval/benchmarks: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListBenchmarksResponse' + tags: + - Benchmarks description: '' parameters: [] post: @@ -936,16 +1030,16 @@ paths: '200': description: OK tags: - - Datasets + - Benchmarks description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/RegisterDatasetRequest' + $ref: '#/components/schemas/RegisterBenchmarkRequest' required: true - /v1/eval/tasks: + /v1/datasets: get: responses: '200': @@ -953,9 +1047,9 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ListBenchmarksResponse' + $ref: '#/components/schemas/ListDatasetsResponse' tags: - - Benchmarks + - Datasets description: '' parameters: [] post: @@ -963,14 +1057,14 @@ paths: '200': description: OK tags: - - Benchmarks + - Datasets description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/RegisterBenchmarkRequest' + $ref: '#/components/schemas/RegisterDatasetRequest' required: true /v1/models: get: @@ -1328,7 +1422,7 @@ paths: type: array items: type: string - /v1/eval/tasks/{benchmark_id}/jobs: + /v1/eval/benchmarks/{benchmark_id}/jobs: post: responses: '200': @@ -1352,6 +1446,31 @@ paths: schema: $ref: '#/components/schemas/RunEvalRequest' required: true + /v1/eval/tasks/{task_id}/jobs: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/Job' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RunEvalDeprecatedRequest' + required: true + deprecated: true /v1/safety/run-shield: post: responses: @@ -1527,10 +1646,10 @@ components: additionalProperties: false required: - data - DeprecatedRegisterBenchmarkRequest: + DeprecatedRegisterEvalTaskRequest: type: object properties: - benchmark_id: + task_id: type: string dataset_id: type: string @@ -1554,7 +1673,7 @@ components: - type: object additionalProperties: false required: - - benchmark_id + - task_id - dataset_id - scoring_functions AppendRowsRequest: @@ -3063,26 +3182,6 
@@ components: - median - categorical_count - accuracy - AppBenchmarkConfig: - type: object - properties: - type: - type: string - const: app - default: app - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - num_examples: - type: integer - additionalProperties: false - required: - - type - - eval_candidate - - scoring_params BasicScoringFnParams: type: object properties: @@ -3097,21 +3196,21 @@ components: additionalProperties: false required: - type - BenchmarkBenchmarkConfig: + BenchmarkConfig: type: object properties: - type: - type: string - const: benchmark - default: benchmark eval_candidate: $ref: '#/components/schemas/EvalCandidate' + scoring_params: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringFnParams' num_examples: type: integer additionalProperties: false required: - - type - eval_candidate + - scoring_params EvalCandidate: oneOf: - $ref: '#/components/schemas/ModelCandidate' @@ -3121,15 +3220,6 @@ components: mapping: model: '#/components/schemas/ModelCandidate' agent: '#/components/schemas/AgentCandidate' - BenchmarkConfig: - oneOf: - - $ref: '#/components/schemas/BenchmarkBenchmarkConfig' - - $ref: '#/components/schemas/AppBenchmarkConfig' - discriminator: - propertyName: type - mapping: - benchmark: '#/components/schemas/BenchmarkBenchmarkConfig' - app: '#/components/schemas/AppBenchmarkConfig' LLMAsJudgeScoringFnParams: type: object properties: @@ -3278,6 +3368,32 @@ components: required: - score_rows - aggregated_results + EvaluateRowsDeprecatedRequest: + type: object + properties: + input_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scoring_functions: + type: array + items: + type: string + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - input_rows + - scoring_functions + - task_config Session: type: object properties: @@ -4645,18 +4761,18 @@ components: additionalProperties: false required: - data - RegisterDatasetRequest: + RegisterBenchmarkRequest: type: object properties: + benchmark_id: + type: string dataset_id: type: string - dataset_schema: - type: object - additionalProperties: - $ref: '#/components/schemas/ParamType' - url: - $ref: '#/components/schemas/URL' - provider_dataset_id: + scoring_functions: + type: array + items: + type: string + provider_benchmark_id: type: string provider_id: type: string @@ -4672,21 +4788,21 @@ components: - type: object additionalProperties: false required: + - benchmark_id - dataset_id - - dataset_schema - - url - RegisterBenchmarkRequest: + - scoring_functions + RegisterDatasetRequest: type: object properties: - benchmark_id: - type: string dataset_id: type: string - scoring_functions: - type: array - items: - type: string - provider_benchmark_id: + dataset_schema: + type: object + additionalProperties: + $ref: '#/components/schemas/ParamType' + url: + $ref: '#/components/schemas/URL' + provider_dataset_id: type: string provider_id: type: string @@ -4702,9 +4818,9 @@ components: - type: object additionalProperties: false required: - - benchmark_id - dataset_id - - scoring_functions + - dataset_schema + - url RegisterModelRequest: type: object properties: @@ -4827,6 +4943,14 @@ components: additionalProperties: false required: - job_id + RunEvalDeprecatedRequest: + type: object + properties: 
+ task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - task_config RunShieldRequest: type: object properties: @@ -5125,10 +5249,10 @@ tags: x-displayName: >- Agents API for creating and interacting with agentic systems. - name: BatchInference (Coming Soon) + - name: Benchmarks - name: DatasetIO - name: Datasets - name: Eval - - name: Benchmarks - name: Inference description: >- This API provides the raw interface to the underlying models. Two kinds of models @@ -5159,10 +5283,10 @@ x-tagGroups: tags: - Agents - BatchInference (Coming Soon) + - Benchmarks - DatasetIO - Datasets - Eval - - Benchmarks - Inference - Inspect - Models diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 90b14131f4..b805e49762 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -83,3 +83,28 @@ async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + async def run_eval_DEPRECATED( + self, + task_id: str, + task_config: BenchmarkConfig, + ) -> Job: ... + + @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") + async def evaluate_rows_DEPRECATED( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") + async def job_status_DEPRECATED(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") + async def job_cancel_DEPRECATED(self, task_id: str, job_id: str) -> None: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") + async def job_result_DEPRECATED(self, task_id: str, job_id: str) -> EvaluateResponse: ... 
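
The five *_DEPRECATED methods above keep the old /eval/tasks routes alive while the canonical routes move to /eval/benchmarks. The protocol only declares them; a sketch of how an implementation could back one of them by pure forwarding is below. This is an assumption about the wiring (the next patch in the series touches the router and the meta-reference provider, presumably to do exactly this), and EvalShim is a made-up name; the logging pattern mirrors the routing table earlier in the series.

import logging
from typing import Any

logger = logging.getLogger(__name__)


class EvalShim:
    async def run_eval(self, benchmark_id: str, task_config: Any) -> Any:
        raise NotImplementedError  # real provider logic lives here

    async def run_eval_DEPRECATED(self, task_id: str, task_config: Any) -> Any:
        # The old `task_id` path parameter maps one-to-one onto the new
        # `benchmark_id`, so the shim just warns, renames, and forwards.
        logger.warning("DEPRECATED: use /eval/benchmarks/{benchmark_id}/jobs instead")
        return await self.run_eval(benchmark_id=task_id, task_config=task_config)

Because the forwarding is purely a rename, old clients keep working unchanged until the deprecated routes are finally dropped.
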
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index f9f3067670..9945ad367b 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -9,7 +9,6 @@ from llama_stack.apis.common.content_types import InterleavedContent, URL from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult from llama_stack.apis.eval import ( - AppBenchmarkConfig, BenchmarkConfig, Eval, EvaluateResponse, @@ -348,7 +347,7 @@ async def shutdown(self) -> None: async def run_eval( self, benchmark_id: str, - task_config: AppBenchmarkConfig, + task_config: BenchmarkConfig, ) -> Job: return await self.routing_table.get_provider_impl(benchmark_id).run_eval( benchmark_id=benchmark_id, From 2d0f6865ac1aaba44ada97caed89b4daaa79e1d4 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:05:39 -0800 Subject: [PATCH 13/31] fix --- docs/_static/llama-stack-spec.html | 3616 ++++++++--------- docs/_static/llama-stack-spec.yaml | 1476 +++---- .../Llama_Stack_Benchmark_Evals.ipynb | 4 +- llama_stack/apis/eval/eval.py | 10 +- llama_stack/distribution/routers/routers.py | 42 + .../distribution/routers/routing_tables.py | 12 +- .../inline/eval/meta_reference/eval.py | 42 + 7 files changed, 2643 insertions(+), 2559 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 652dae562c..381f37f1f0 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,6 +40,47 @@ } ], "paths": { + "/v1/eval/tasks/{task_id}/evaluations": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DeprecatedEvaluateRowsRequest" + } + } + }, + "required": true + }, + "deprecated": true + } + }, "/v1/eval-tasks/{benchmark_id}": { "get": { "responses": { @@ -78,6 +119,121 @@ "deprecated": true } }, + "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/JobStatus" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + }, + "delete": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, + "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + 
"in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, "/v1/eval-tasks": { "get": { "responses": { @@ -123,6 +279,47 @@ "deprecated": true } }, + "/v1/eval/tasks/{task_id}/jobs": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Job" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DeprecatedRunEvalRequest" + } + } + }, + "required": true + }, + "deprecated": true + } + }, "/v1/datasetio/rows": { "get": { "responses": { @@ -653,47 +850,6 @@ } } }, - "/v1/eval/tasks/{task_id}/evaluations": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateResponse" - } - } - } - } - }, - "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateRowsDeprecatedRequest" - } - } - }, - "required": true - }, - "deprecated": true - } - }, "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": { "get": { "responses": { @@ -1546,7 +1702,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1554,14 +1710,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/JobStatus" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/EvaluateResponse" } } } @@ -1573,76 +1722,7 @@ "description": "", "parameters": [ { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ], - "deprecated": true - }, - "delete": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ], - "deprecated": true - } - }, - "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateResponse" - } - } - } - } - }, - "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "benchmark_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1660,45 +1740,6 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateResponse" - } - } - } - } - }, - "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - 
"type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ], - "deprecated": true - } - }, "/v1/eval/benchmarks": { "get": { "responses": { @@ -2400,47 +2441,6 @@ } } }, - "/v1/eval/tasks/{task_id}/jobs": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Job" - } - } - } - } - }, - "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RunEvalDeprecatedRequest" - } - } - }, - "required": true - }, - "deprecated": true - } - }, "/v1/safety/run-shield": { "post": { "responses": { @@ -2645,216 +2645,211 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { - "Benchmark": { + "AgentCandidate": { "type": "object", "properties": { - "identifier": { - "type": "string" - }, - "provider_resource_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, "type": { "type": "string", - "const": "benchmark", - "default": "benchmark" - }, - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } + "const": "agent", + "default": "agent" }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "config": { + "$ref": "#/components/schemas/AgentConfig" } }, "additionalProperties": false, "required": [ - "identifier", - "provider_resource_id", - "provider_id", "type", - "dataset_id", - "scoring_functions", - "metadata" + "config" ] }, - "ListBenchmarksResponse": { + "AgentConfig": { "type": "object", "properties": { - "data": { + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "input_shields": { "type": "array", "items": { - "$ref": "#/components/schemas/Benchmark" + "type": "string" } - } - }, - "additionalProperties": false, - "required": [ - "data" - ] - }, - "DeprecatedRegisterEvalTaskRequest": { - "type": "object", - "properties": { - "task_id": { - "type": "string" - }, - "dataset_id": { - "type": "string" }, - "scoring_functions": { + "output_shields": { "type": "array", "items": { "type": "string" } }, - "provider_benchmark_id": { - "type": "string" + "toolgroups": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AgentTool" + } }, - "provider_id": { - "type": "string" + "client_tools": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolDef" + } }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. 
It depends on the Instruction Following capabilities of the model.", + "default": "auto" + }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "Prompt format for calling custom / zero shot tools." + }, + "tool_config": { + "$ref": "#/components/schemas/ToolConfig" + }, + "max_infer_iters": { + "type": "integer", + "default": 10 + }, + "model": { + "type": "string" + }, + "instructions": { + "type": "string" + }, + "enable_session_persistence": { + "type": "boolean" + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat" } }, "additionalProperties": false, "required": [ - "task_id", - "dataset_id", - "scoring_functions" + "model", + "instructions", + "enable_session_persistence" ] }, - "AppendRowsRequest": { + "AgentTool": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "args": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "name", + "args" + ] + } + ] + }, + "AggregationFunctionType": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ] + }, + "BasicScoringFnParams": { "type": "object", "properties": { - "dataset_id": { - "type": "string" + "type": { + "type": "string", + "const": "basic", + "default": "basic" }, - "rows": { + "aggregation_functions": { "type": "array", "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "$ref": "#/components/schemas/AggregationFunctionType" } } }, "additionalProperties": false, "required": [ - "dataset_id", - "rows" + "type" ] }, - "CompletionMessage": { + "BenchmarkConfig": { "type": "object", "properties": { - "role": { - "type": "string", - "const": "assistant", - "default": "assistant", - "description": "Must be \"assistant\" to identify this as the model's response" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the model's response" + "eval_candidate": { + "$ref": "#/components/schemas/EvalCandidate" }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." + "scoring_params": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringFnParams" + } }, - "tool_calls": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolCall" - }, - "description": "List of tool calls. Each tool call is a ToolCall object." 
+ "num_examples": { + "type": "integer" } }, "additionalProperties": false, "required": [ - "role", - "content", - "stop_reason" + "eval_candidate", + "scoring_params" + ] + }, + "EvalCandidate": { + "oneOf": [ + { + "$ref": "#/components/schemas/ModelCandidate" + }, + { + "$ref": "#/components/schemas/AgentCandidate" + } ], - "description": "A message containing the model's (assistant) response in a chat conversation." + "discriminator": { + "propertyName": "type", + "mapping": { + "model": "#/components/schemas/ModelCandidate", + "agent": "#/components/schemas/AgentCandidate" + } + } }, "GrammarResponseFormat": { "type": "object", @@ -3022,38 +3017,97 @@ ], "description": "Configuration for JSON schema-guided response generation." }, - "Message": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" + "LLMAsJudgeScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm_as_judge", + "default": "llm_as_judge" }, - { - "$ref": "#/components/schemas/SystemMessage" + "judge_model": { + "type": "string" }, - { - "$ref": "#/components/schemas/ToolResponseMessage" + "prompt_template": { + "type": "string" }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ], - "discriminator": { - "propertyName": "role", - "mapping": { - "user": "#/components/schemas/UserMessage", - "system": "#/components/schemas/SystemMessage", - "tool": "#/components/schemas/ToolResponseMessage", - "assistant": "#/components/schemas/CompletionMessage" + "judge_score_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } } - } + }, + "additionalProperties": false, + "required": [ + "type", + "judge_model" + ] }, - "ResponseFormat": { - "oneOf": [ - { - "$ref": "#/components/schemas/JsonSchemaResponseFormat" + "ModelCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "model", + "default": "model" }, - { - "$ref": "#/components/schemas/GrammarResponseFormat" + "model": { + "type": "string" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "system_message": { + "$ref": "#/components/schemas/SystemMessage" + } + }, + "additionalProperties": false, + "required": [ + "type", + "model", + "sampling_params" + ] + }, + "RegexParserScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "regex_parser", + "default": "regex_parser" + }, + "parsing_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + "ResponseFormat": { + "oneOf": [ + { + "$ref": "#/components/schemas/JsonSchemaResponseFormat" + }, + { + "$ref": "#/components/schemas/GrammarResponseFormat" } ], "discriminator": { @@ -3105,6 +3159,27 @@ } } }, + "ScoringFnParams": { + "oneOf": [ + { + "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" + }, + { + "$ref": "#/components/schemas/RegexParserScoringFnParams" + }, + { + "$ref": "#/components/schemas/BasicScoringFnParams" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", + "regex_parser": "#/components/schemas/RegexParserScoringFnParams", + "basic": "#/components/schemas/BasicScoringFnParams" + } + 
} + }, "SystemMessage": { "type": "object", "properties": { @@ -3147,90 +3222,79 @@ ], "description": "A text content item" }, - "ToolCall": { + "ToolConfig": { "type": "object", "properties": { - "call_id": { + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.", + "default": "auto" + }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." + }, + "system_message_behavior": { + "type": "string", + "enum": [ + "append", + "replace" + ], + "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", + "default": "append" + } + }, + "additionalProperties": false, + "required": [ + "system_message_behavior" + ], + "description": "Configuration for tool use." + }, + "ToolDef": { + "type": "object", + "properties": { + "name": { "type": "string" }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] + "description": { + "type": "string" }, - "arguments": { + "parameters": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolParameter" + } + }, + "metadata": { "type": "object", "additionalProperties": { "oneOf": [ { - "type": "string" + "type": "null" }, { - "type": "integer" + "type": "boolean" }, { "type": "number" }, { - "type": "boolean" - }, - { - "type": "null" + "type": "string" }, { - "type": "array", - "items": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } + "type": "array" }, { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } + "type": "object" } ] } @@ -3238,49 +3302,16 @@ }, "additionalProperties": false, "required": [ - "call_id", - "tool_name", - "arguments" + "name" ] }, - "ToolDefinition": { + "ToolParameter": { "type": "object", "properties": { - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] - }, - "description": { + "name": { "type": "string" }, - "parameters": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ToolParamDefinition" - } - } - }, - "additionalProperties": false, - "required": [ - "tool_name" - ] - }, - "ToolParamDefinition": { - "type": "object", - "properties": { - "param_type": { + "parameter_type": { "type": "string" }, "description": { @@ -3315,60 
+3346,19 @@ }, "additionalProperties": false, "required": [ - "param_type" + "name", + "parameter_type", + "description", + "required" ] }, - "ToolResponseMessage": { + "TopKSamplingStrategy": { "type": "object", "properties": { - "role": { - "type": "string", - "const": "tool", - "default": "tool", - "description": "Must be \"tool\" to identify this as a tool response" - }, - "call_id": { + "type": { "type": "string", - "description": "Unique identifier for the tool call this response is for" - }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ], - "description": "Name of the tool that was called" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The response content from the tool" - } - }, - "additionalProperties": false, - "required": [ - "role", - "call_id", - "tool_name", - "content" - ], - "description": "A message representing the result of a tool invocation." - }, - "TopKSamplingStrategy": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "top_k", - "default": "top_k" + "const": "top_k", + "default": "top_k" }, "top_k": { "type": "integer" @@ -3413,597 +3403,692 @@ "uri" ] }, - "UserMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "user", - "default": "user", - "description": "Must be \"user\" to identify this as a user message" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the message, which can include text and other media" - }, - "context": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future." - } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "description": "A message from the user in a chat conversation." - }, - "BatchChatCompletionRequest": { + "DeprecatedEvaluateRowsRequest": { "type": "object", "properties": { - "model": { - "type": "string" - }, - "messages_batch": { + "input_rows": { "type": "array", "items": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Message" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] } } }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "tools": { + "scoring_functions": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDefinition" + "type": "string" } }, - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "Prompt format for calling custom / zero shot tools." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat" - }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." 
- } - }, - "additionalProperties": false + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, "required": [ - "model", - "messages_batch" + "input_rows", + "scoring_functions", + "task_config" ] }, - "BatchChatCompletionResponse": { + "EvaluateResponse": { "type": "object", "properties": { - "batch": { + "generations": { "type": "array", "items": { - "$ref": "#/components/schemas/ChatCompletionResponse" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "scores": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" } } }, "additionalProperties": false, "required": [ - "batch" + "generations", + "scores" ] }, - "ChatCompletionResponse": { + "ScoringResult": { "type": "object", "properties": { - "completion_message": { - "$ref": "#/components/schemas/CompletionMessage", - "description": "The complete response message" - }, - "logprobs": { + "score_rows": { "type": "array", "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" - } - }, - "additionalProperties": false, - "required": [ - "completion_message" - ], - "description": "Response from a chat completion request." - }, - "TokenLogProbs": { - "type": "object", - "properties": { - "logprobs_by_token": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "aggregated_results": { "type": "object", "additionalProperties": { - "type": "number" - }, - "description": "Dictionary mapping tokens to their log probabilities" + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, "required": [ - "logprobs_by_token" - ], - "description": "Log probabilities for generated tokens." + "score_rows", + "aggregated_results" + ] }, - "BatchCompletionRequest": { + "Benchmark": { "type": "object", "properties": { - "model": { + "identifier": { "type": "string" }, - "content_batch": { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContent" - } - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "provider_resource_id": { + "type": "string" }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "provider_id": { + "type": "string" }, - "logprobs": { + "type": { + "type": "string", + "const": "benchmark", + "default": "benchmark" + }, + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "metadata": { "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." 
- } - }, - "additionalProperties": false + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, "required": [ - "model", - "content_batch" + "identifier", + "provider_resource_id", + "provider_id", + "type", + "dataset_id", + "scoring_functions", + "metadata" ] }, - "BatchCompletionResponse": { + "JobStatus": { + "type": "string", + "enum": [ + "completed", + "in_progress", + "failed", + "scheduled" + ] + }, + "ListBenchmarksResponse": { "type": "object", "properties": { - "batch": { + "data": { "type": "array", "items": { - "$ref": "#/components/schemas/CompletionResponse" + "$ref": "#/components/schemas/Benchmark" } } }, "additionalProperties": false, "required": [ - "batch" + "data" ] }, - "CompletionResponse": { + "DeprecatedRegisterEvalTaskRequest": { "type": "object", "properties": { - "content": { - "type": "string", - "description": "The generated completion text" + "task_id": { + "type": "string" }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Reason why generation stopped" + "dataset_id": { + "type": "string" }, - "logprobs": { + "scoring_functions": { "type": "array", "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" + "type": "string" + } + }, + "provider_benchmark_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, "required": [ - "content", - "stop_reason" - ], - "description": "Response from a completion request." + "task_id", + "dataset_id", + "scoring_functions" + ] }, - "CancelTrainingJobRequest": { + "DeprecatedRunEvalRequest": { "type": "object", "properties": { - "job_uuid": { + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "task_config" + ] + }, + "Job": { + "type": "object", + "properties": { + "job_id": { "type": "string" } }, "additionalProperties": false, "required": [ - "job_uuid" + "job_id" ] }, - "ToolConfig": { + "AppendRowsRequest": { "type": "object", "properties": { - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.", - "default": "auto" - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." + "dataset_id": { + "type": "string" }, - "system_message_behavior": { - "type": "string", - "enum": [ - "append", - "replace" - ], - "description": "(Optional) Config for how to override the default system prompt. 
- `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", - "default": "append" + "rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } } }, "additionalProperties": false, "required": [ - "system_message_behavior" - ], - "description": "Configuration for tool use." + "dataset_id", + "rows" + ] }, - "ChatCompletionRequest": { + "CompletionMessage": { "type": "object", "properties": { - "model_id": { + "role": { "type": "string", - "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." + "const": "assistant", + "default": "assistant", + "description": "Must be \"assistant\" to identify this as the model's response" }, - "messages": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Message" - }, - "description": "List of messages in the conversation" + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the model's response" }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "Parameters to control the sampling strategy" + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." }, - "tools": { + "tool_calls": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDefinition" + "$ref": "#/components/schemas/ToolCall" }, - "description": "(Optional) List of tool definitions available to the model" - }, - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead." - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. 
This format is more flexible, but not all providers support it." - }, - "stream": { - "type": "boolean", - "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." - }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." - } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." - }, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig", - "description": "(Optional) Configuration for tool use." - } - }, - "additionalProperties": false, - "required": [ - "model_id", - "messages" - ] - }, - "ChatCompletionResponseEvent": { - "type": "object", - "properties": { - "event_type": { - "type": "string", - "enum": [ - "start", - "complete", - "progress" - ], - "description": "Type of the event" - }, - "delta": { - "$ref": "#/components/schemas/ContentDelta", - "description": "Content generated since last event. This can be one or more tokens, or a tool call." - }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" - }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Optional reason why generation stopped, if complete" - } - }, - "additionalProperties": false, - "required": [ - "event_type", - "delta" - ], - "description": "An event during chat completion generation." - }, - "ChatCompletionResponseStreamChunk": { - "type": "object", - "properties": { - "event": { - "$ref": "#/components/schemas/ChatCompletionResponseEvent", - "description": "The event containing the new content" + "description": "List of tool calls. Each tool call is a ToolCall object." } }, "additionalProperties": false, "required": [ - "event" + "role", + "content", + "stop_reason" ], - "description": "A chunk of a streamed chat completion response." + "description": "A message containing the model's (assistant) response in a chat conversation." 
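
For orientation, a minimal sketch of registering an eval task against the deprecated /v1/eval-tasks route using the DeprecatedRegisterEvalTaskRequest schema above. The base URL, the use of the requests library, and the concrete identifiers are illustrative assumptions, not part of the spec.

import requests

# Illustrative payload following DeprecatedRegisterEvalTaskRequest:
# task_id, dataset_id and scoring_functions are required; the rest optional.
payload = {
    "task_id": "meta-reference-mmlu",           # assumed example identifier
    "dataset_id": "mmlu",                       # assumed example dataset
    "scoring_functions": ["basic::subset_of"],  # assumed example scorer
}
resp = requests.post("http://localhost:8321/v1/eval-tasks", json=payload)  # assumed base URL
resp.raise_for_status()  # the spec only documents a 200 OK response
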
}, - "ContentDelta": { + "Message": { "oneOf": [ { - "$ref": "#/components/schemas/TextDelta" + "$ref": "#/components/schemas/UserMessage" }, { - "$ref": "#/components/schemas/ImageDelta" + "$ref": "#/components/schemas/SystemMessage" }, { - "$ref": "#/components/schemas/ToolCallDelta" + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" } ], "discriminator": { - "propertyName": "type", + "propertyName": "role", "mapping": { - "text": "#/components/schemas/TextDelta", - "image": "#/components/schemas/ImageDelta", - "tool_call": "#/components/schemas/ToolCallDelta" + "user": "#/components/schemas/UserMessage", + "system": "#/components/schemas/SystemMessage", + "tool": "#/components/schemas/ToolResponseMessage", + "assistant": "#/components/schemas/CompletionMessage" } } }, - "ImageDelta": { + "ToolCall": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "image", - "default": "image" + "call_id": { + "type": "string" }, - "image": { - "type": "string", - "contentEncoding": "base64" + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" + } + ] + }, + "arguments": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + }, + { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + } + ] + } } }, "additionalProperties": false, "required": [ - "type", - "image" + "call_id", + "tool_name", + "arguments" ] }, - "TextDelta": { + "ToolDefinition": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "text", - "default": "text" + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" + } + ] }, - "text": { + "description": { "type": "string" + }, + "parameters": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ToolParamDefinition" + } } }, "additionalProperties": false, "required": [ - "type", - "text" + "tool_name" ] }, - "ToolCallDelta": { + "ToolParamDefinition": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "tool_call", - "default": "tool_call" + "param_type": { + "type": "string" }, - "tool_call": { + "description": { + "type": "string" + }, + "required": { + "type": "boolean", + "default": true + }, + "default": { "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, { "type": "string" }, { - "$ref": "#/components/schemas/ToolCall" + "type": "array" + }, + { + "type": "object" } ] - }, - "parse_status": { - "type": "string", - "enum": [ - "started", - "in_progress", - "failed", - "succeeded" - ] } }, "additionalProperties": false, "required": [ - "type", - "tool_call", - "parse_status" + "param_type" ] }, - "CompletionRequest": { + "ToolResponseMessage": { "type": "object", "properties": { - "model_id": { + "role": { "type": "string", - "description": "The 
identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content to generate a completion for" - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "(Optional) Parameters to control the sampling strategy" - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding" + "const": "tool", + "default": "tool", + "description": "Must be \"tool\" to identify this as a tool response" }, - "stream": { - "type": "boolean", - "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." + "call_id": { + "type": "string", + "description": "Unique identifier for the tool call this response is for" }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." + ], + "description": "Name of the tool that was called" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The response content from the tool" } }, "additionalProperties": false, "required": [ - "model_id", + "role", + "call_id", + "tool_name", "content" - ] + ], + "description": "A message representing the result of a tool invocation." }, - "CompletionResponseStreamChunk": { + "UserMessage": { "type": "object", "properties": { - "delta": { + "role": { "type": "string", - "description": "New content generated since last chunk. This can be one or more tokens." + "const": "user", + "default": "user", + "description": "Must be \"user\" to identify this as a user message" }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Optional reason why generation stopped, if complete" + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the message, which can include text and other media" }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" + "context": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future." } }, "additionalProperties": false, "required": [ - "delta" + "role", + "content" ], - "description": "A chunk of a streamed completion response." + "description": "A message from the user in a chat conversation." 
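
The Message union above is discriminated on role; a rough sketch of a conversation history conforming to it (contents invented for illustration):

# Each entry selects one variant of the Message union via its "role" field.
messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "What is the capital of France?"},
    {
        "role": "assistant",
        "content": "Paris.",
        "stop_reason": "end_of_turn",  # required on CompletionMessage
        "tool_calls": [],
    },
]
print([m["role"] for m in messages])  # ['system', 'user', 'assistant']
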
}, - "AgentConfig": { + "BatchChatCompletionRequest": { "type": "object", "properties": { - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "input_shields": { - "type": "array", - "items": { - "type": "string" - } + "model": { + "type": "string" }, - "output_shields": { + "messages_batch": { "type": "array", "items": { - "type": "string" + "type": "array", + "items": { + "$ref": "#/components/schemas/Message" + } } }, - "toolgroups": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AgentTool" - } + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" }, - "client_tools": { + "tools": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDef" + "$ref": "#/components/schemas/ToolDefinition" } }, "tool_choice": { @@ -4012,8 +4097,7 @@ "auto", "required" ], - "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.", - "default": "auto" + "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." }, "tool_prompt_format": { "type": "string", @@ -4024,521 +4108,518 @@ ], "description": "Prompt format for calling custom / zero shot tools." }, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig" - }, - "max_infer_iters": { - "type": "integer", - "default": 10 - }, - "model": { - "type": "string" - }, - "instructions": { - "type": "string" - }, - "enable_session_persistence": { - "type": "boolean" - }, "response_format": { "$ref": "#/components/schemas/ResponseFormat" + }, + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false } }, "additionalProperties": false, "required": [ "model", - "instructions", - "enable_session_persistence" + "messages_batch" ] }, - "AgentTool": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "args": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "name", - "args" - ] + "BatchChatCompletionResponse": { + "type": "object", + "properties": { + "batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ChatCompletionResponse" + } } + }, + "additionalProperties": false, + "required": [ + "batch" ] }, - "ToolDef": { + "ChatCompletionResponse": { "type": "object", "properties": { - "name": { - "type": "string" - }, - "description": { - "type": "string" + "completion_message": { + "$ref": "#/components/schemas/CompletionMessage", + "description": "The complete response message" }, - "parameters": { + "logprobs": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolParameter" - } - }, - "metadata": { + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" + } + }, + "additionalProperties": false, + "required": [ + "completion_message" + ], + "description": "Response from a chat completion request." 
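
To make the ChatCompletionResponse shape concrete, a sketch of unpacking a fabricated response body; only completion_message is required, logprobs is optional:

# Fabricated body conforming to the ChatCompletionResponse schema above.
body = {
    "completion_message": {
        "role": "assistant",
        "content": "Hello!",
        "stop_reason": "end_of_turn",
    }
}
msg = body["completion_message"]
print(msg["content"])        # the generated text
print(msg["stop_reason"])    # why generation stopped
print(body.get("logprobs"))  # None unless logprobs were requested
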
+ }, + "TokenLogProbs": { + "type": "object", + "properties": { + "logprobs_by_token": { "type": "object", "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "type": "number" + }, + "description": "Dictionary mapping tokens to their log probabilities" } }, "additionalProperties": false, "required": [ - "name" - ] + "logprobs_by_token" + ], + "description": "Log probabilities for generated tokens." }, - "ToolParameter": { + "BatchCompletionRequest": { "type": "object", "properties": { - "name": { + "model": { "type": "string" }, - "parameter_type": { - "type": "string" + "content_batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContent" + } }, - "description": { - "type": "string" + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" }, - "required": { - "type": "boolean", - "default": true + "response_format": { + "$ref": "#/components/schemas/ResponseFormat" }, - "default": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." } - ] + }, + "additionalProperties": false } }, "additionalProperties": false, "required": [ - "name", - "parameter_type", - "description", - "required" + "model", + "content_batch" ] }, - "CreateAgentRequest": { + "BatchCompletionResponse": { "type": "object", "properties": { - "agent_config": { - "$ref": "#/components/schemas/AgentConfig" + "batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/CompletionResponse" + } } }, "additionalProperties": false, "required": [ - "agent_config" + "batch" ] }, - "AgentCreateResponse": { + "CompletionResponse": { "type": "object", "properties": { - "agent_id": { - "type": "string" + "content": { + "type": "string", + "description": "The generated completion text" + }, + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Reason why generation stopped" + }, + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" } }, "additionalProperties": false, "required": [ - "agent_id" - ] + "content", + "stop_reason" + ], + "description": "Response from a completion request." }, - "CreateAgentSessionRequest": { + "CancelTrainingJobRequest": { "type": "object", "properties": { - "session_name": { + "job_uuid": { "type": "string" } }, "additionalProperties": false, "required": [ - "session_name" + "job_uuid" ] }, - "AgentSessionCreateResponse": { + "ChatCompletionRequest": { "type": "object", "properties": { - "session_id": { - "type": "string" + "model_id": { + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." 
+ }, + "messages": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Message" + }, + "description": "List of messages in the conversation" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "Parameters to control the sampling strategy" + }, + "tools": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolDefinition" + }, + "description": "(Optional) List of tool definitions available to the model" + }, + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead." + }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead." + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it." + }, + "stream": { + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." + }, + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." + }, + "tool_config": { + "$ref": "#/components/schemas/ToolConfig", + "description": "(Optional) Configuration for tool use." } }, "additionalProperties": false, "required": [ - "session_id" + "model_id", + "messages" ] }, - "CreateAgentTurnRequest": { + "ChatCompletionResponseEvent": { "type": "object", "properties": { - "messages": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - } - ] - } - }, - "stream": { - "type": "boolean" - }, - "documents": { - "type": "array", - "items": { - "type": "object", - "properties": { - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/InterleavedContentItem" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContentItem" - } - }, - { - "$ref": "#/components/schemas/URL" - } - ] - }, - "mime_type": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "content", - "mime_type" - ] - } + "event_type": { + "type": "string", + "enum": [ + "start", + "complete", + "progress" + ], + "description": "Type of the event" }, - "toolgroups": { + "delta": { + "$ref": "#/components/schemas/ContentDelta", + "description": "Content generated since last event. This can be one or more tokens, or a tool call." 
+ }, + "logprobs": { "type": "array", "items": { - "$ref": "#/components/schemas/AgentTool" - } + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" }, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig" + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Optional reason why generation stopped, if complete" } }, "additionalProperties": false, "required": [ - "messages" - ] + "event_type", + "delta" + ], + "description": "An event during chat completion generation." }, - "InferenceStep": { + "ChatCompletionResponseStreamChunk": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, - "step_id": { - "type": "string" + "event": { + "$ref": "#/components/schemas/ChatCompletionResponseEvent", + "description": "The event containing the new content" + } + }, + "additionalProperties": false, + "required": [ + "event" + ], + "description": "A chunk of a streamed chat completion response." + }, + "ContentDelta": { + "oneOf": [ + { + "$ref": "#/components/schemas/TextDelta" }, - "started_at": { - "type": "string", - "format": "date-time" + { + "$ref": "#/components/schemas/ImageDelta" }, - "completed_at": { + { + "$ref": "#/components/schemas/ToolCallDelta" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "text": "#/components/schemas/TextDelta", + "image": "#/components/schemas/ImageDelta", + "tool_call": "#/components/schemas/ToolCallDelta" + } + } + }, + "ImageDelta": { + "type": "object", + "properties": { + "type": { "type": "string", - "format": "date-time" + "const": "image", + "default": "image" }, - "step_type": { + "image": { "type": "string", - "const": "inference", - "default": "inference" - }, - "model_response": { - "$ref": "#/components/schemas/CompletionMessage" + "contentEncoding": "base64" } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "model_response" + "type", + "image" ] }, - "MemoryRetrievalStep": { + "TextDelta": { "type": "object", "properties": { - "turn_id": { - "type": "string" + "type": { + "type": "string", + "const": "text", + "default": "text" }, - "step_id": { + "text": { "type": "string" - }, - "started_at": { + } + }, + "additionalProperties": false, + "required": [ + "type", + "text" + ] + }, + "ToolCallDelta": { + "type": "object", + "properties": { + "type": { "type": "string", - "format": "date-time" + "const": "tool_call", + "default": "tool_call" }, - "completed_at": { - "type": "string", - "format": "date-time" + "tool_call": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/ToolCall" + } + ] }, - "step_type": { + "parse_status": { "type": "string", - "const": "memory_retrieval", - "default": "memory_retrieval" - }, - "vector_db_ids": { - "type": "string" - }, - "inserted_context": { - "$ref": "#/components/schemas/InterleavedContent" + "enum": [ + "started", + "in_progress", + "failed", + "succeeded" + ] } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "vector_db_ids", - "inserted_context" + "type", + "tool_call", + "parse_status" ] }, - "SafetyViolation": { + "CompletionRequest": { "type": "object", "properties": { - "violation_level": { - "$ref": "#/components/schemas/ViolationLevel" + "model_id": { + "type": "string", + "description": "The identifier of the model to use. 
The model must be registered with Llama Stack and available via the /models endpoint." }, - "user_message": { - "type": "string" + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content to generate a completion for" }, - "metadata": { + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "(Optional) Parameters to control the sampling strategy" + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding" + }, + "stream": { + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." + }, + "logprobs": { "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." } }, "additionalProperties": false, "required": [ - "violation_level", - "metadata" + "model_id", + "content" ] }, - "ShieldCallStep": { + "CompletionResponseStreamChunk": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, - "step_id": { - "type": "string" - }, - "started_at": { - "type": "string", - "format": "date-time" - }, - "completed_at": { + "delta": { "type": "string", - "format": "date-time" + "description": "New content generated since last chunk. This can be one or more tokens." }, - "step_type": { + "stop_reason": { "type": "string", - "const": "shield_call", - "default": "shield_call" + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Optional reason why generation stopped, if complete" }, - "violation": { - "$ref": "#/components/schemas/SafetyViolation" + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" + } + }, + "additionalProperties": false, + "required": [ + "delta" + ], + "description": "A chunk of a streamed completion response." 
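
A sketch of accumulating streamed chunks that conform to the CompletionResponseStreamChunk schema above; the chunk list is fabricated stand-in data for what a real client would read off the SSE stream:

# Each chunk carries a required "delta"; stop_reason appears once complete.
chunks = [
    {"delta": "The sky "},
    {"delta": "is blue."},
    {"delta": "", "stop_reason": "end_of_turn"},
]
text = "".join(c["delta"] for c in chunks)
stop_reason = next((c["stop_reason"] for c in chunks if "stop_reason" in c), None)
print(text)         # The sky is blue.
print(stop_reason)  # end_of_turn
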
+ }, + "CreateAgentRequest": { + "type": "object", + "properties": { + "agent_config": { + "$ref": "#/components/schemas/AgentConfig" } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type" + "agent_config" ] }, - "ToolExecutionStep": { + "AgentCreateResponse": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, - "step_id": { + "agent_id": { "type": "string" - }, - "started_at": { - "type": "string", - "format": "date-time" - }, - "completed_at": { - "type": "string", - "format": "date-time" - }, - "step_type": { - "type": "string", - "const": "tool_execution", - "default": "tool_execution" - }, - "tool_calls": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolCall" - } - }, - "tool_responses": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolResponse" - } } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "tool_calls", - "tool_responses" + "agent_id" ] }, - "ToolResponse": { + "CreateAgentSessionRequest": { "type": "object", "properties": { - "call_id": { + "session_name": { "type": "string" - }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "call_id", - "tool_name", - "content" + "session_name" ] }, - "Turn": { + "AgentSessionCreateResponse": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, "session_id": { "type": "string" - }, - "input_messages": { + } + }, + "additionalProperties": false, + "required": [ + "session_id" + ] + }, + "CreateAgentTurnRequest": { + "type": "object", + "properties": { + "messages": { "type": "array", "items": { "oneOf": [ @@ -4551,38 +4632,10 @@ ] } }, - "steps": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/InferenceStep" - }, - { - "$ref": "#/components/schemas/ToolExecutionStep" - }, - { - "$ref": "#/components/schemas/ShieldCallStep" - }, - { - "$ref": "#/components/schemas/MemoryRetrievalStep" - } - ], - "discriminator": { - "propertyName": "step_type", - "mapping": { - "inference": "#/components/schemas/InferenceStep", - "tool_execution": "#/components/schemas/ToolExecutionStep", - "shield_call": "#/components/schemas/ShieldCallStep", - "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" - } - } - } - }, - "output_message": { - "$ref": "#/components/schemas/CompletionMessage" + "stream": { + "type": "boolean" }, - "output_attachments": { + "documents": { "type": "array", "items": { "type": "object", @@ -4617,179 +4670,100 @@ ] } }, - "started_at": { - "type": "string", - "format": "date-time" + "toolgroups": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AgentTool" + } }, - "completed_at": { - "type": "string", - "format": "date-time" + "tool_config": { + "$ref": "#/components/schemas/ToolConfig" } }, "additionalProperties": false, "required": [ - "turn_id", - "session_id", - "input_messages", - "steps", - "output_message", - "started_at" - ], - "description": "A single turn in an interaction with an Agentic System." 
- }, - "ViolationLevel": { - "type": "string", - "enum": [ - "info", - "warn", - "error" + "messages" ] }, - "AgentTurnResponseEvent": { + "InferenceStep": { "type": "object", "properties": { - "payload": { - "$ref": "#/components/schemas/AgentTurnResponseEventPayload" - } - }, - "additionalProperties": false, - "required": [ - "payload" - ] - }, - "AgentTurnResponseEventPayload": { - "oneOf": [ - { - "$ref": "#/components/schemas/AgentTurnResponseStepStartPayload" - }, - { - "$ref": "#/components/schemas/AgentTurnResponseStepProgressPayload" + "turn_id": { + "type": "string" }, - { - "$ref": "#/components/schemas/AgentTurnResponseStepCompletePayload" + "step_id": { + "type": "string" }, - { - "$ref": "#/components/schemas/AgentTurnResponseTurnStartPayload" + "started_at": { + "type": "string", + "format": "date-time" }, - { - "$ref": "#/components/schemas/AgentTurnResponseTurnCompletePayload" - } - ], - "discriminator": { - "propertyName": "event_type", - "mapping": { - "step_start": "#/components/schemas/AgentTurnResponseStepStartPayload", - "step_progress": "#/components/schemas/AgentTurnResponseStepProgressPayload", - "step_complete": "#/components/schemas/AgentTurnResponseStepCompletePayload", - "turn_start": "#/components/schemas/AgentTurnResponseTurnStartPayload", - "turn_complete": "#/components/schemas/AgentTurnResponseTurnCompletePayload" - } - } - }, - "AgentTurnResponseStepCompletePayload": { - "type": "object", - "properties": { - "event_type": { + "completed_at": { "type": "string", - "const": "step_complete", - "default": "step_complete" + "format": "date-time" }, "step_type": { "type": "string", - "enum": [ - "inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] - }, - "step_id": { - "type": "string" + "const": "inference", + "default": "inference" }, - "step_details": { - "oneOf": [ - { - "$ref": "#/components/schemas/InferenceStep" - }, - { - "$ref": "#/components/schemas/ToolExecutionStep" - }, - { - "$ref": "#/components/schemas/ShieldCallStep" - }, - { - "$ref": "#/components/schemas/MemoryRetrievalStep" - } - ], - "discriminator": { - "propertyName": "step_type", - "mapping": { - "inference": "#/components/schemas/InferenceStep", - "tool_execution": "#/components/schemas/ToolExecutionStep", - "shield_call": "#/components/schemas/ShieldCallStep", - "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" - } - } + "model_response": { + "$ref": "#/components/schemas/CompletionMessage" } }, "additionalProperties": false, "required": [ - "event_type", - "step_type", + "turn_id", "step_id", - "step_details" + "step_type", + "model_response" ] }, - "AgentTurnResponseStepProgressPayload": { + "MemoryRetrievalStep": { "type": "object", "properties": { - "event_type": { + "turn_id": { + "type": "string" + }, + "step_id": { + "type": "string" + }, + "started_at": { "type": "string", - "const": "step_progress", - "default": "step_progress" + "format": "date-time" + }, + "completed_at": { + "type": "string", + "format": "date-time" }, "step_type": { "type": "string", - "enum": [ - "inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] + "const": "memory_retrieval", + "default": "memory_retrieval" }, - "step_id": { + "vector_db_ids": { "type": "string" }, - "delta": { - "$ref": "#/components/schemas/ContentDelta" + "inserted_context": { + "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "event_type", - "step_type", + "turn_id", "step_id", - "delta" + "step_type", + "vector_db_ids", 
+ "inserted_context" ] }, - "AgentTurnResponseStepStartPayload": { + "SafetyViolation": { "type": "object", "properties": { - "event_type": { - "type": "string", - "const": "step_start", - "default": "step_start" - }, - "step_type": { - "type": "string", - "enum": [ - "inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] + "violation_level": { + "$ref": "#/components/schemas/ViolationLevel" }, - "step_id": { + "user_message": { "type": "string" }, "metadata": { @@ -4820,416 +4794,384 @@ }, "additionalProperties": false, "required": [ - "event_type", - "step_type", - "step_id" - ] - }, - "AgentTurnResponseStreamChunk": { - "type": "object", - "properties": { - "event": { - "$ref": "#/components/schemas/AgentTurnResponseEvent" - } - }, - "additionalProperties": false, - "required": [ - "event" - ], - "description": "streamed agent turn completion response." - }, - "AgentTurnResponseTurnCompletePayload": { - "type": "object", - "properties": { - "event_type": { - "type": "string", - "const": "turn_complete", - "default": "turn_complete" - }, - "turn": { - "$ref": "#/components/schemas/Turn" - } - }, - "additionalProperties": false, - "required": [ - "event_type", - "turn" + "violation_level", + "metadata" ] }, - "AgentTurnResponseTurnStartPayload": { + "ShieldCallStep": { "type": "object", "properties": { - "event_type": { - "type": "string", - "const": "turn_start", - "default": "turn_start" - }, "turn_id": { "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "event_type", - "turn_id" - ] - }, - "EmbeddingsRequest": { - "type": "object", - "properties": { - "model_id": { + }, + "step_id": { + "type": "string" + }, + "started_at": { "type": "string", - "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." + "format": "date-time" }, - "contents": { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContent" - }, - "description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text." - } - }, - "additionalProperties": false, - "required": [ - "model_id", - "contents" - ] - }, - "EmbeddingsResponse": { - "type": "object", - "properties": { - "embeddings": { - "type": "array", - "items": { - "type": "array", - "items": { - "type": "number" - } - }, - "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}" - } - }, - "additionalProperties": false, - "required": [ - "embeddings" - ], - "description": "Response containing generated embeddings." 
- }, - "AgentCandidate": { - "type": "object", - "properties": { - "type": { + "completed_at": { "type": "string", - "const": "agent", - "default": "agent" + "format": "date-time" }, - "config": { - "$ref": "#/components/schemas/AgentConfig" - } - }, - "additionalProperties": false, - "required": [ - "type", - "config" - ] - }, - "AggregationFunctionType": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ] - }, - "BasicScoringFnParams": { - "type": "object", - "properties": { - "type": { + "step_type": { "type": "string", - "const": "basic", - "default": "basic" + "const": "shield_call", + "default": "shield_call" }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } + "violation": { + "$ref": "#/components/schemas/SafetyViolation" } }, "additionalProperties": false, "required": [ - "type" + "turn_id", + "step_id", + "step_type" ] }, - "BenchmarkConfig": { + "ToolExecutionStep": { "type": "object", "properties": { - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate" - }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" - } + "turn_id": { + "type": "string" }, - "num_examples": { - "type": "integer" - } - }, - "additionalProperties": false, - "required": [ - "eval_candidate", - "scoring_params" - ] - }, - "EvalCandidate": { - "oneOf": [ - { - "$ref": "#/components/schemas/ModelCandidate" + "step_id": { + "type": "string" }, - { - "$ref": "#/components/schemas/AgentCandidate" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "model": "#/components/schemas/ModelCandidate", - "agent": "#/components/schemas/AgentCandidate" - } - } - }, - "LLMAsJudgeScoringFnParams": { - "type": "object", - "properties": { - "type": { + "started_at": { "type": "string", - "const": "llm_as_judge", - "default": "llm_as_judge" + "format": "date-time" }, - "judge_model": { - "type": "string" + "completed_at": { + "type": "string", + "format": "date-time" }, - "prompt_template": { - "type": "string" + "step_type": { + "type": "string", + "const": "tool_execution", + "default": "tool_execution" }, - "judge_score_regexes": { + "tool_calls": { "type": "array", "items": { - "type": "string" + "$ref": "#/components/schemas/ToolCall" } }, - "aggregation_functions": { + "tool_responses": { "type": "array", "items": { - "$ref": "#/components/schemas/AggregationFunctionType" + "$ref": "#/components/schemas/ToolResponse" } } }, "additionalProperties": false, "required": [ - "type", - "judge_model" + "turn_id", + "step_id", + "step_type", + "tool_calls", + "tool_responses" ] }, - "ModelCandidate": { + "ToolResponse": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "model", - "default": "model" - }, - "model": { + "call_id": { "type": "string" }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" + } + ] }, - "system_message": { - "$ref": "#/components/schemas/SystemMessage" + "content": { + "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "type", - "model", - "sampling_params" + "call_id", + "tool_name", + "content" ] }, - "RegexParserScoringFnParams": { + "Turn": { "type": "object", "properties": { - "type": { - "type": "string", - 
"const": "regex_parser", - "default": "regex_parser" + "turn_id": { + "type": "string" }, - "parsing_regexes": { + "session_id": { + "type": "string" + }, + "input_messages": { "type": "array", "items": { - "type": "string" + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + } + ] } }, - "aggregation_functions": { + "steps": { "type": "array", "items": { - "$ref": "#/components/schemas/AggregationFunctionType" + "oneOf": [ + { + "$ref": "#/components/schemas/InferenceStep" + }, + { + "$ref": "#/components/schemas/ToolExecutionStep" + }, + { + "$ref": "#/components/schemas/ShieldCallStep" + }, + { + "$ref": "#/components/schemas/MemoryRetrievalStep" + } + ], + "discriminator": { + "propertyName": "step_type", + "mapping": { + "inference": "#/components/schemas/InferenceStep", + "tool_execution": "#/components/schemas/ToolExecutionStep", + "shield_call": "#/components/schemas/ShieldCallStep", + "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" + } + } + } + }, + "output_message": { + "$ref": "#/components/schemas/CompletionMessage" + }, + "output_attachments": { + "type": "array", + "items": { + "type": "object", + "properties": { + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/InterleavedContentItem" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContentItem" + } + }, + { + "$ref": "#/components/schemas/URL" + } + ] + }, + "mime_type": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "content", + "mime_type" + ] } + }, + "started_at": { + "type": "string", + "format": "date-time" + }, + "completed_at": { + "type": "string", + "format": "date-time" } }, "additionalProperties": false, "required": [ - "type" + "turn_id", + "session_id", + "input_messages", + "steps", + "output_message", + "started_at" + ], + "description": "A single turn in an interaction with an Agentic System." 
+ }, + "ViolationLevel": { + "type": "string", + "enum": [ + "info", + "warn", + "error" ] }, - "ScoringFnParams": { + "AgentTurnResponseEvent": { + "type": "object", + "properties": { + "payload": { + "$ref": "#/components/schemas/AgentTurnResponseEventPayload" + } + }, + "additionalProperties": false, + "required": [ + "payload" + ] + }, + "AgentTurnResponseEventPayload": { "oneOf": [ { - "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" + "$ref": "#/components/schemas/AgentTurnResponseStepStartPayload" }, { - "$ref": "#/components/schemas/RegexParserScoringFnParams" + "$ref": "#/components/schemas/AgentTurnResponseStepProgressPayload" }, { - "$ref": "#/components/schemas/BasicScoringFnParams" + "$ref": "#/components/schemas/AgentTurnResponseStepCompletePayload" + }, + { + "$ref": "#/components/schemas/AgentTurnResponseTurnStartPayload" + }, + { + "$ref": "#/components/schemas/AgentTurnResponseTurnCompletePayload" } ], "discriminator": { - "propertyName": "type", + "propertyName": "event_type", "mapping": { - "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", - "regex_parser": "#/components/schemas/RegexParserScoringFnParams", - "basic": "#/components/schemas/BasicScoringFnParams" + "step_start": "#/components/schemas/AgentTurnResponseStepStartPayload", + "step_progress": "#/components/schemas/AgentTurnResponseStepProgressPayload", + "step_complete": "#/components/schemas/AgentTurnResponseStepCompletePayload", + "turn_start": "#/components/schemas/AgentTurnResponseTurnStartPayload", + "turn_complete": "#/components/schemas/AgentTurnResponseTurnCompletePayload" } } }, - "EvaluateRowsRequest": { + "AgentTurnResponseStepCompletePayload": { "type": "object", "properties": { - "input_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] + "event_type": { + "type": "string", + "const": "step_complete", + "default": "step_complete" + }, + "step_type": { + "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ] + }, + "step_id": { + "type": "string" + }, + "step_details": { + "oneOf": [ + { + "$ref": "#/components/schemas/InferenceStep" + }, + { + "$ref": "#/components/schemas/ToolExecutionStep" + }, + { + "$ref": "#/components/schemas/ShieldCallStep" + }, + { + "$ref": "#/components/schemas/MemoryRetrievalStep" + } + ], + "discriminator": { + "propertyName": "step_type", + "mapping": { + "inference": "#/components/schemas/InferenceStep", + "tool_execution": "#/components/schemas/ToolExecutionStep", + "shield_call": "#/components/schemas/ShieldCallStep", + "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" } } - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } - }, - "task_config": { - "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, "required": [ - "input_rows", - "scoring_functions", - "task_config" + "event_type", + "step_type", + "step_id", + "step_details" ] }, - "EvaluateResponse": { + "AgentTurnResponseStepProgressPayload": { "type": "object", "properties": { - "generations": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": 
"object" - } - ] - } - } + "event_type": { + "type": "string", + "const": "step_progress", + "default": "step_progress" }, - "scores": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - } + "step_type": { + "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ] + }, + "step_id": { + "type": "string" + }, + "delta": { + "$ref": "#/components/schemas/ContentDelta" } }, "additionalProperties": false, "required": [ - "generations", - "scores" + "event_type", + "step_type", + "step_id", + "delta" ] }, - "ScoringResult": { + "AgentTurnResponseStepStartPayload": { "type": "object", "properties": { - "score_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } + "event_type": { + "type": "string", + "const": "step_start", + "default": "step_start" }, - "aggregated_results": { + "step_type": { + "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ] + }, + "step_id": { + "type": "string" + }, + "metadata": { "type": "object", "additionalProperties": { "oneOf": [ @@ -5257,11 +5199,102 @@ }, "additionalProperties": false, "required": [ - "score_rows", - "aggregated_results" + "event_type", + "step_type", + "step_id" + ] + }, + "AgentTurnResponseStreamChunk": { + "type": "object", + "properties": { + "event": { + "$ref": "#/components/schemas/AgentTurnResponseEvent" + } + }, + "additionalProperties": false, + "required": [ + "event" + ], + "description": "streamed agent turn completion response." + }, + "AgentTurnResponseTurnCompletePayload": { + "type": "object", + "properties": { + "event_type": { + "type": "string", + "const": "turn_complete", + "default": "turn_complete" + }, + "turn": { + "$ref": "#/components/schemas/Turn" + } + }, + "additionalProperties": false, + "required": [ + "event_type", + "turn" + ] + }, + "AgentTurnResponseTurnStartPayload": { + "type": "object", + "properties": { + "event_type": { + "type": "string", + "const": "turn_start", + "default": "turn_start" + }, + "turn_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "event_type", + "turn_id" + ] + }, + "EmbeddingsRequest": { + "type": "object", + "properties": { + "model_id": { + "type": "string", + "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." + }, + "contents": { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContent" + }, + "description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text." + } + }, + "additionalProperties": false, + "required": [ + "model_id", + "contents" ] }, - "EvaluateRowsDeprecatedRequest": { + "EmbeddingsResponse": { + "type": "object", + "properties": { + "embeddings": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "number" + } + }, + "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. 
The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}" + } + }, + "additionalProperties": false, + "required": [ + "embeddings" + ], + "description": "Response containing generated embeddings." + }, + "EvaluateRowsRequest": { "type": "object", "properties": { "input_rows": { @@ -6165,15 +6198,6 @@ ], "description": "Artifacts of a finetuning job." }, - "JobStatus": { - "type": "string", - "enum": [ - "completed", - "in_progress", - "failed", - "scheduled" - ] - }, "PostTrainingJobStatusResponse": { "type": "object", "properties": { @@ -7812,30 +7836,6 @@ "task_config" ] }, - "Job": { - "type": "object", - "properties": { - "job_id": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_id" - ] - }, - "RunEvalDeprecatedRequest": { - "type": "object", - "properties": { - "task_config": { - "$ref": "#/components/schemas/BenchmarkConfig" - } - }, - "additionalProperties": false, - "required": [ - "task_config" - ] - }, "RunShieldRequest": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 89e0669177..83bc5483c8 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,6 +10,31 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: + /v1/eval/tasks/{task_id}/evaluations: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedEvaluateRowsRequest' + required: true + deprecated: true /v1/eval-tasks/{benchmark_id}: get: responses: @@ -31,6 +56,75 @@ paths: schema: type: string deprecated: true + /v1/eval/tasks/{task_id}/jobs/{job_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/JobStatus' + - type: 'null' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + delete: + responses: + '200': + description: OK + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true /v1/eval-tasks: get: responses: @@ -60,6 +154,31 @@ paths: $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' required: true deprecated: true + /v1/eval/tasks/{task_id}/jobs: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/Job' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedRunEvalRequest' + required: true + deprecated: 
true /v1/datasetio/rows: get: responses: @@ -396,31 +515,6 @@ paths: schema: $ref: '#/components/schemas/EvaluateRowsRequest' required: true - /v1/eval/tasks/{task_id}/evaluations: - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - tags: - - Eval - description: '' - parameters: - - name: task_id - in: path - required: true - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateRowsDeprecatedRequest' - required: true - deprecated: true /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}: get: responses: @@ -920,51 +1014,6 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}/jobs/{job_id}: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/JobStatus' - - type: 'null' - tags: - - Eval - description: '' - parameters: - - name: task_id - in: path - required: true - schema: - type: string - - name: job_id - in: path - required: true - schema: - type: string - deprecated: true - delete: - responses: - '200': - description: OK - tags: - - Eval - description: '' - parameters: - - name: task_id - in: path - required: true - schema: - type: string - - name: job_id - in: path - required: true - schema: - type: string - deprecated: true /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: get: responses: @@ -988,30 +1037,6 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}/jobs/{job_id}/result: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - tags: - - Eval - description: '' - parameters: - - name: task_id - in: path - required: true - schema: - type: string - - name: job_id - in: path - required: true - schema: - type: string - deprecated: true /v1/eval/benchmarks: get: responses: @@ -1446,31 +1471,6 @@ paths: schema: $ref: '#/components/schemas/RunEvalRequest' required: true - /v1/eval/tasks/{task_id}/jobs: - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/Job' - tags: - - Eval - description: '' - parameters: - - name: task_id - in: path - required: true - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RunEvalDeprecatedRequest' - required: true - deprecated: true /v1/safety/run-shield: post: responses: @@ -1598,143 +1598,142 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: - Benchmark: + AgentCandidate: type: object properties: - identifier: - type: string - provider_resource_id: - type: string - provider_id: - type: string type: type: string - const: benchmark - default: benchmark - dataset_id: - type: string - scoring_functions: - type: array - items: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + const: agent + default: agent + config: + $ref: '#/components/schemas/AgentConfig' additionalProperties: false required: - - identifier - - provider_resource_id - - provider_id - type - - dataset_id - - scoring_functions - - metadata - ListBenchmarksResponse: + - config + AgentConfig: type: object properties: - data: + sampling_params: + $ref: '#/components/schemas/SamplingParams' + input_shields: type: array items: - $ref: 
'#/components/schemas/Benchmark' - additionalProperties: false - required: - - data - DeprecatedRegisterEvalTaskRequest: - type: object - properties: - task_id: - type: string - dataset_id: - type: string - scoring_functions: + type: string + output_shields: type: array items: type: string - provider_benchmark_id: + toolgroups: + type: array + items: + $ref: '#/components/schemas/AgentTool' + client_tools: + type: array + items: + $ref: '#/components/schemas/ToolDef' + tool_choice: type: string - provider_id: + enum: + - auto + - required + description: >- + Whether tool use is required or automatic. This is a hint to the model + which may not be followed. It depends on the Instruction Following capabilities + of the model. + default: auto + tool_prompt_format: type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + enum: + - json + - function_tag + - python_list + description: >- + Prompt format for calling custom / zero shot tools. + tool_config: + $ref: '#/components/schemas/ToolConfig' + max_infer_iters: + type: integer + default: 10 + model: + type: string + instructions: + type: string + enable_session_persistence: + type: boolean + response_format: + $ref: '#/components/schemas/ResponseFormat' additionalProperties: false required: - - task_id - - dataset_id - - scoring_functions - AppendRowsRequest: + - model + - instructions + - enable_session_persistence + AgentTool: + oneOf: + - type: string + - type: object + properties: + name: + type: string + args: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + - args + AggregationFunctionType: + type: string + enum: + - average + - median + - categorical_count + - accuracy + BasicScoringFnParams: type: object properties: - dataset_id: + type: type: string - rows: + const: basic + default: basic + aggregation_functions: type: array items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + $ref: '#/components/schemas/AggregationFunctionType' additionalProperties: false required: - - dataset_id - - rows - CompletionMessage: + - type + BenchmarkConfig: type: object properties: - role: - type: string - const: assistant - default: assistant - description: >- - Must be "assistant" to identify this as the model's response - content: - $ref: '#/components/schemas/InterleavedContent' - description: The content of the model's response - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: - The model finished generating the entire response. - `StopReason.end_of_message`: - The model finished generating but generated a partial response -- usually, - a tool call. The user may call the tool and continue the conversation - with the tool's response. - `StopReason.out_of_tokens`: The model ran - out of token budget. - tool_calls: - type: array - items: - $ref: '#/components/schemas/ToolCall' - description: >- - List of tool calls. Each tool call is a ToolCall object. 
+ eval_candidate: + $ref: '#/components/schemas/EvalCandidate' + scoring_params: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringFnParams' + num_examples: + type: integer additionalProperties: false required: - - role - - content - - stop_reason - description: >- - A message containing the model's (assistant) response in a chat conversation. + - eval_candidate + - scoring_params + EvalCandidate: + oneOf: + - $ref: '#/components/schemas/ModelCandidate' + - $ref: '#/components/schemas/AgentCandidate' + discriminator: + propertyName: type + mapping: + model: '#/components/schemas/ModelCandidate' + agent: '#/components/schemas/AgentCandidate' GrammarResponseFormat: type: object properties: @@ -1845,56 +1844,113 @@ components: - json_schema description: >- Configuration for JSON schema-guided response generation. - Message: - oneOf: - - $ref: '#/components/schemas/UserMessage' - - $ref: '#/components/schemas/SystemMessage' - - $ref: '#/components/schemas/ToolResponseMessage' - - $ref: '#/components/schemas/CompletionMessage' - discriminator: - propertyName: role - mapping: - user: '#/components/schemas/UserMessage' - system: '#/components/schemas/SystemMessage' - tool: '#/components/schemas/ToolResponseMessage' - assistant: '#/components/schemas/CompletionMessage' - ResponseFormat: - oneOf: - - $ref: '#/components/schemas/JsonSchemaResponseFormat' - - $ref: '#/components/schemas/GrammarResponseFormat' - discriminator: - propertyName: type - mapping: - json_schema: '#/components/schemas/JsonSchemaResponseFormat' - grammar: '#/components/schemas/GrammarResponseFormat' - SamplingParams: + LLMAsJudgeScoringFnParams: type: object properties: - strategy: - $ref: '#/components/schemas/SamplingStrategy' - max_tokens: - type: integer - default: 0 - repetition_penalty: - type: number - default: 1.0 + type: + type: string + const: llm_as_judge + default: llm_as_judge + judge_model: + type: string + prompt_template: + type: string + judge_score_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' additionalProperties: false required: - - strategy - SamplingStrategy: - oneOf: - - $ref: '#/components/schemas/GreedySamplingStrategy' - - $ref: '#/components/schemas/TopPSamplingStrategy' - - $ref: '#/components/schemas/TopKSamplingStrategy' - discriminator: - propertyName: type - mapping: - greedy: '#/components/schemas/GreedySamplingStrategy' - top_p: '#/components/schemas/TopPSamplingStrategy' - top_k: '#/components/schemas/TopKSamplingStrategy' - SystemMessage: - type: object - properties: + - type + - judge_model + ModelCandidate: + type: object + properties: + type: + type: string + const: model + default: model + model: + type: string + sampling_params: + $ref: '#/components/schemas/SamplingParams' + system_message: + $ref: '#/components/schemas/SystemMessage' + additionalProperties: false + required: + - type + - model + - sampling_params + RegexParserScoringFnParams: + type: object + properties: + type: + type: string + const: regex_parser + default: regex_parser + parsing_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type + ResponseFormat: + oneOf: + - $ref: '#/components/schemas/JsonSchemaResponseFormat' + - $ref: '#/components/schemas/GrammarResponseFormat' + discriminator: + propertyName: type + mapping: + json_schema: 
'#/components/schemas/JsonSchemaResponseFormat' + grammar: '#/components/schemas/GrammarResponseFormat' + SamplingParams: + type: object + properties: + strategy: + $ref: '#/components/schemas/SamplingStrategy' + max_tokens: + type: integer + default: 0 + repetition_penalty: + type: number + default: 1.0 + additionalProperties: false + required: + - strategy + SamplingStrategy: + oneOf: + - $ref: '#/components/schemas/GreedySamplingStrategy' + - $ref: '#/components/schemas/TopPSamplingStrategy' + - $ref: '#/components/schemas/TopKSamplingStrategy' + discriminator: + propertyName: type + mapping: + greedy: '#/components/schemas/GreedySamplingStrategy' + top_p: '#/components/schemas/TopPSamplingStrategy' + top_k: '#/components/schemas/TopKSamplingStrategy' + ScoringFnParams: + oneOf: + - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' + - $ref: '#/components/schemas/RegexParserScoringFnParams' + - $ref: '#/components/schemas/BasicScoringFnParams' + discriminator: + propertyName: type + mapping: + llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' + regex_parser: '#/components/schemas/RegexParserScoringFnParams' + basic: '#/components/schemas/BasicScoringFnParams' + SystemMessage: + type: object + properties: role: type: string const: system @@ -1921,15 +1977,392 @@ components: const: text default: text description: >- - Discriminator type of the content item. Always "text" - text: - type: string - description: Text content + Discriminator type of the content item. Always "text" + text: + type: string + description: Text content + additionalProperties: false + required: + - type + - text + description: A text content item + ToolConfig: + type: object + properties: + tool_choice: + type: string + enum: + - auto + - required + description: >- + (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. + default: auto + tool_prompt_format: + type: string + enum: + - json + - function_tag + - python_list + description: >- + (Optional) Instructs the model how to format tool calls. By default, Llama + Stack will attempt to use a format that is best adapted to the model. + - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. + - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a + tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python + syntax -- a list of function calls. + system_message_behavior: + type: string + enum: + - append + - replace + description: >- + (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: + Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: + Replaces the default system prompt with the provided system message. The + system message can include the string '{{function_definitions}}' to indicate + where the function definitions should be inserted. + default: append + additionalProperties: false + required: + - system_message_behavior + description: Configuration for tool use. 
+ ToolDef: + type: object + properties: + name: + type: string + description: + type: string + parameters: + type: array + items: + $ref: '#/components/schemas/ToolParameter' + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + ToolParameter: + type: object + properties: + name: + type: string + parameter_type: + type: string + description: + type: string + required: + type: boolean + default: true + default: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + - parameter_type + - description + - required + TopKSamplingStrategy: + type: object + properties: + type: + type: string + const: top_k + default: top_k + top_k: + type: integer + additionalProperties: false + required: + - type + - top_k + TopPSamplingStrategy: + type: object + properties: + type: + type: string + const: top_p + default: top_p + temperature: + type: number + top_p: + type: number + default: 0.95 + additionalProperties: false + required: + - type + URL: + type: object + properties: + uri: + type: string + additionalProperties: false + required: + - uri + DeprecatedEvaluateRowsRequest: + type: object + properties: + input_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scoring_functions: + type: array + items: + type: string + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - input_rows + - scoring_functions + - task_config + EvaluateResponse: + type: object + properties: + generations: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scores: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + additionalProperties: false + required: + - generations + - scores + ScoringResult: + type: object + properties: + score_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + aggregated_results: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - score_rows + - aggregated_results + Benchmark: + type: object + properties: + identifier: + type: string + provider_resource_id: + type: string + provider_id: + type: string + type: + type: string + const: benchmark + default: benchmark + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - identifier + - provider_resource_id + - provider_id + - type + - dataset_id + - scoring_functions + - metadata + JobStatus: + type: string + enum: + - completed + - in_progress + - failed + - scheduled + ListBenchmarksResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/Benchmark' + additionalProperties: false + required: + - data + 
DeprecatedRegisterEvalTaskRequest: + type: object + properties: + task_id: + type: string + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + provider_benchmark_id: + type: string + provider_id: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - task_id + - dataset_id + - scoring_functions + DeprecatedRunEvalRequest: + type: object + properties: + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - task_config + Job: + type: object + properties: + job_id: + type: string + additionalProperties: false + required: + - job_id + AppendRowsRequest: + type: object + properties: + dataset_id: + type: string + rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - dataset_id + - rows + CompletionMessage: + type: object + properties: + role: + type: string + const: assistant + default: assistant + description: >- + Must be "assistant" to identify this as the model's response + content: + $ref: '#/components/schemas/InterleavedContent' + description: The content of the model's response + stop_reason: + type: string + enum: + - end_of_turn + - end_of_message + - out_of_tokens + description: >- + Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: + The model finished generating the entire response. - `StopReason.end_of_message`: + The model finished generating but generated a partial response -- usually, + a tool call. The user may call the tool and continue the conversation + with the tool's response. - `StopReason.out_of_tokens`: The model ran + out of token budget. + tool_calls: + type: array + items: + $ref: '#/components/schemas/ToolCall' + description: >- + List of tool calls. Each tool call is a ToolCall object. additionalProperties: false required: - - type - - text - description: A text content item + - role + - content + - stop_reason + description: >- + A message containing the model's (assistant) response in a chat conversation. + Message: + oneOf: + - $ref: '#/components/schemas/UserMessage' + - $ref: '#/components/schemas/SystemMessage' + - $ref: '#/components/schemas/ToolResponseMessage' + - $ref: '#/components/schemas/CompletionMessage' + discriminator: + propertyName: role + mapping: + user: '#/components/schemas/UserMessage' + system: '#/components/schemas/SystemMessage' + tool: '#/components/schemas/ToolResponseMessage' + assistant: '#/components/schemas/CompletionMessage' ToolCall: type: object properties: @@ -2050,42 +2483,6 @@ components: - content description: >- A message representing the result of a tool invocation. 
- TopKSamplingStrategy: - type: object - properties: - type: - type: string - const: top_k - default: top_k - top_k: - type: integer - additionalProperties: false - required: - - type - - top_k - TopPSamplingStrategy: - type: object - properties: - type: - type: string - const: top_p - default: top_p - temperature: - type: number - top_p: - type: number - default: 0.95 - additionalProperties: false - required: - - type - URL: - type: object - properties: - uri: - type: string - additionalProperties: false - required: - - uri UserMessage: type: object properties: @@ -2266,46 +2663,6 @@ components: additionalProperties: false required: - job_uuid - ToolConfig: - type: object - properties: - tool_choice: - type: string - enum: - - auto - - required - description: >- - (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. - default: auto - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - (Optional) Instructs the model how to format tool calls. By default, Llama - Stack will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python - syntax -- a list of function calls. - system_message_behavior: - type: string - enum: - - append - - replace - description: >- - (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: - Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: - Replaces the default system prompt with the provided system message. The - system message can include the string '{{function_definitions}}' to indicate - where the function definitions should be inserted. - default: append - additionalProperties: false - required: - - system_message_behavior - description: Configuration for tool use. ChatCompletionRequest: type: object properties: @@ -2528,161 +2885,33 @@ components: additionalProperties: false required: - model_id - - content - CompletionResponseStreamChunk: - type: object - properties: - delta: - type: string - description: >- - New content generated since last chunk. This can be one or more tokens. - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Optional reason why generation stopped, if complete - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - description: >- - Optional log probabilities for generated tokens - additionalProperties: false - required: - - delta - description: >- - A chunk of a streamed completion response. - AgentConfig: - type: object - properties: - sampling_params: - $ref: '#/components/schemas/SamplingParams' - input_shields: - type: array - items: - type: string - output_shields: - type: array - items: - type: string - toolgroups: - type: array - items: - $ref: '#/components/schemas/AgentTool' - client_tools: - type: array - items: - $ref: '#/components/schemas/ToolDef' - tool_choice: - type: string - enum: - - auto - - required - description: >- - Whether tool use is required or automatic. This is a hint to the model - which may not be followed. It depends on the Instruction Following capabilities - of the model. 
- default: auto - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - Prompt format for calling custom / zero shot tools. - tool_config: - $ref: '#/components/schemas/ToolConfig' - max_infer_iters: - type: integer - default: 10 - model: - type: string - instructions: - type: string - enable_session_persistence: - type: boolean - response_format: - $ref: '#/components/schemas/ResponseFormat' - additionalProperties: false - required: - - model - - instructions - - enable_session_persistence - AgentTool: - oneOf: - - type: string - - type: object - properties: - name: - type: string - args: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - name - - args - ToolDef: - type: object - properties: - name: - type: string - description: - type: string - parameters: - type: array - items: - $ref: '#/components/schemas/ToolParameter' - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - name - ToolParameter: - type: object - properties: - name: - type: string - parameter_type: - type: string - description: - type: string - required: - type: boolean - default: true - default: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + - content + CompletionResponseStreamChunk: + type: object + properties: + delta: + type: string + description: >- + New content generated since last chunk. This can be one or more tokens. + stop_reason: + type: string + enum: + - end_of_turn + - end_of_message + - out_of_tokens + description: >- + Optional reason why generation stopped, if complete + logprobs: + type: array + items: + $ref: '#/components/schemas/TokenLogProbs' + description: >- + Optional log probabilities for generated tokens additionalProperties: false required: - - name - - parameter_type - - description - - required + - delta + description: >- + A chunk of a streamed completion response. CreateAgentRequest: type: object properties: @@ -3162,134 +3391,6 @@ components: - embeddings description: >- Response containing generated embeddings. 
- AgentCandidate: - type: object - properties: - type: - type: string - const: agent - default: agent - config: - $ref: '#/components/schemas/AgentConfig' - additionalProperties: false - required: - - type - - config - AggregationFunctionType: - type: string - enum: - - average - - median - - categorical_count - - accuracy - BasicScoringFnParams: - type: object - properties: - type: - type: string - const: basic - default: basic - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - BenchmarkConfig: - type: object - properties: - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - num_examples: - type: integer - additionalProperties: false - required: - - eval_candidate - - scoring_params - EvalCandidate: - oneOf: - - $ref: '#/components/schemas/ModelCandidate' - - $ref: '#/components/schemas/AgentCandidate' - discriminator: - propertyName: type - mapping: - model: '#/components/schemas/ModelCandidate' - agent: '#/components/schemas/AgentCandidate' - LLMAsJudgeScoringFnParams: - type: object - properties: - type: - type: string - const: llm_as_judge - default: llm_as_judge - judge_model: - type: string - prompt_template: - type: string - judge_score_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - - judge_model - ModelCandidate: - type: object - properties: - type: - type: string - const: model - default: model - model: - type: string - sampling_params: - $ref: '#/components/schemas/SamplingParams' - system_message: - $ref: '#/components/schemas/SystemMessage' - additionalProperties: false - required: - - type - - model - - sampling_params - RegexParserScoringFnParams: - type: object - properties: - type: - type: string - const: regex_parser - default: regex_parser - parsing_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - ScoringFnParams: - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - - $ref: '#/components/schemas/RegexParserScoringFnParams' - - $ref: '#/components/schemas/BasicScoringFnParams' - discriminator: - propertyName: type - mapping: - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - basic: '#/components/schemas/BasicScoringFnParams' EvaluateRowsRequest: type: object properties: @@ -3316,84 +3417,6 @@ components: - input_rows - scoring_functions - task_config - EvaluateResponse: - type: object - properties: - generations: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - scores: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - additionalProperties: false - required: - - generations - - scores - ScoringResult: - type: object - properties: - score_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - aggregated_results: - type: object - additionalProperties: - oneOf: - - type: 
'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - score_rows - - aggregated_results - EvaluateRowsDeprecatedRequest: - type: object - properties: - input_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - scoring_functions: - type: array - items: - type: string - task_config: - $ref: '#/components/schemas/BenchmarkConfig' - additionalProperties: false - required: - - input_rows - - scoring_functions - - task_config Session: type: object properties: @@ -3925,13 +3948,6 @@ components: - job_uuid - checkpoints description: Artifacts of a finetuning job. - JobStatus: - type: string - enum: - - completed - - in_progress - - failed - - scheduled PostTrainingJobStatusResponse: type: object properties: @@ -4935,22 +4951,6 @@ components: additionalProperties: false required: - task_config - Job: - type: object - properties: - job_id: - type: string - additionalProperties: false - required: - - job_id - RunEvalDeprecatedRequest: - type: object - properties: - task_config: - $ref: '#/components/schemas/BenchmarkConfig' - additionalProperties: false - required: - - task_config RunShieldRequest: type: object properties: diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 599df201a0..857b7f1337 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1203,7 +1203,7 @@ ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " benchmark_id=\"meta-reference::simpleqa\",\n", + " task_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", @@ -1214,7 +1214,7 @@ " \"sampling_params\": {\n", " \"strategy\": {\n", " \"type\": \"greedy\",\n", - " },\n", + " },b\n", " \"max_tokens\": 4096,\n", " \"repeat_penalty\": 1.0,\n", " },\n", diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index b805e49762..010189cc7c 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -85,14 +85,14 @@ async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") - async def run_eval_DEPRECATED( + async def DEPRECATED_run_eval( self, task_id: str, task_config: BenchmarkConfig, ) -> Job: ... @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") - async def evaluate_rows_DEPRECATED( + async def DEPRECATED_evaluate_rows( self, task_id: str, input_rows: List[Dict[str, Any]], @@ -101,10 +101,10 @@ async def evaluate_rows_DEPRECATED( ) -> EvaluateResponse: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") - async def job_status_DEPRECATED(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") - async def job_cancel_DEPRECATED(self, task_id: str, job_id: str) -> None: ... + async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ... 
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") - async def job_result_DEPRECATED(self, task_id: str, job_id: str) -> EvaluateResponse: ... + async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ... diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 9945ad367b..845010f543 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -395,6 +395,48 @@ async def job_result( job_id, ) + async def DEPRECATED_run_eval( + self, + task_id: str, + task_config: BenchmarkConfig, + ) -> Job: + return await self.run_eval(benchmark_id=task_id, task_config=task_config) + + async def DEPRECATED_evaluate_rows( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: + return await self.evaluate_rows( + benchmark_id=task_id, + input_rows=input_rows, + scoring_functions=scoring_functions, + task_config=task_config, + ) + + async def DEPRECATED_job_status( + self, + task_id: str, + job_id: str, + ) -> Optional[JobStatus]: + return await self.job_status(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_cancel( + self, + task_id: str, + job_id: str, + ) -> None: + return await self.job_cancel(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_result( + self, + task_id: str, + job_id: str, + ) -> EvaluateResponse: + return await self.job_result(benchmark_id=task_id, job_id=job_id) + class ToolRuntimeRouter(ToolRuntime): class RagToolImpl(RAGToolRuntime): diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 5d2da73372..b16becf1a9 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -471,18 +471,18 @@ async def register_benchmark( async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: logger.warning("DEPRECATED: Use /eval/benchmarks instead") - raise DeprecationWarning("Use /eval/tasks instead") + return await self.list_benchmarks() async def DEPRECATED_get_eval_task( self, - benchmark_id: str, + task_id: str, ) -> Optional[Benchmark]: logger.warning("DEPRECATED: Use /eval/benchmarks instead") - raise DeprecationWarning("Use /eval/tasks instead") + return await self.get_benchmark(task_id) async def DEPRECATED_register_eval_task( self, - benchmark_id: str, + task_id: str, dataset_id: str, scoring_functions: List[str], provider_benchmark_id: Optional[str] = None, @@ -490,8 +490,8 @@ async def DEPRECATED_register_eval_task( metadata: Optional[Dict[str, Any]] = None, ) -> None: logger.warning("DEPRECATED: Use /eval/benchmarks instead") - self.register_benchmark( - benchmark_id=benchmark_id, + return await self.register_benchmark( + benchmark_id=task_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index a02418e741..3ae530a47a 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -234,3 +234,45 @@ async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: raise ValueError(f"Job is not completed, Status: {status.value}") return self.jobs[job_id] + + async def DEPRECATED_run_eval( + self, + task_id: str, + task_config: 
BenchmarkConfig, + ) -> Job: + return await self.run_eval(benchmark_id=task_id, task_config=task_config) + + async def DEPRECATED_evaluate_rows( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: + return await self.evaluate_rows( + benchmark_id=task_id, + input_rows=input_rows, + scoring_functions=scoring_functions, + task_config=task_config, + ) + + async def DEPRECATED_job_status( + self, + task_id: str, + job_id: str, + ) -> Optional[JobStatus]: + return await self.job_status(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_cancel( + self, + task_id: str, + job_id: str, + ) -> None: + return await self.job_cancel(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_result( + self, + task_id: str, + job_id: str, + ) -> EvaluateResponse: + return await self.job_result(benchmark_id=task_id, job_id=job_id) From 1395de57a6de1499b4dda289535ea39b32b5b418 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:16:47 -0800 Subject: [PATCH 14/31] fix --- docs/_static/llama-stack-spec.html | 4 ++-- docs/_static/llama-stack-spec.yaml | 4 ++-- docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 381f37f1f0..cba7829a18 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -81,7 +81,7 @@ "deprecated": true } }, - "/v1/eval-tasks/{benchmark_id}": { + "/v1/eval-tasks/{task_id}": { "get": { "responses": { "200": { @@ -109,7 +109,7 @@ "parameters": [ { "name": "task_id", - "in": "query", + "in": "path", "required": true, "schema": { "type": "string" diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 83bc5483c8..0bc4987764 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -35,7 +35,7 @@ paths: $ref: '#/components/schemas/DeprecatedEvaluateRowsRequest' required: true deprecated: true - /v1/eval-tasks/{benchmark_id}: + /v1/eval-tasks/{task_id}: get: responses: '200': @@ -51,7 +51,7 @@ paths: description: '' parameters: - name: task_id - in: query + in: path required: true schema: type: string diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 857b7f1337..8eecf84abb 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1017,14 +1017,14 @@ " \"content\": SYSTEM_PROMPT_TEMPLATE.format(subject=subset),\n", "}\n", "\n", - "client.benchmarks.register(\n", - " benchmark_id=\"meta-reference::mmmu\",\n", + "client.eval_tasks.register(\n", + " eval_task_id=\"meta-reference::mmmu\",\n", " dataset_id=f\"mmmu-{subset}-{split}\",\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " benchmark_id=\"meta-reference::mmmu\",\n", + " task_id=\"meta-reference::mmmu\",\n", " input_rows=eval_rows,\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", " task_config={\n", @@ -1196,8 +1196,8 @@ " provider_id=\"together\",\n", ")\n", "\n", - "client.benchmarks.register(\n", - " benchmark_id=\"meta-reference::simpleqa\",\n", + "client.eval_tasks.register(\n", + " eval_task_id=\"meta-reference::simpleqa\",\n", " dataset_id=simpleqa_dataset_id,\n", " 
scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", ")\n", @@ -1214,7 +1214,7 @@ " \"sampling_params\": {\n", " \"strategy\": {\n", " \"type\": \"greedy\",\n", - " },b\n", + " },\n", " \"max_tokens\": 4096,\n", " \"repeat_penalty\": 1.0,\n", " },\n", @@ -1352,7 +1352,7 @@ "}\n", "\n", "response = client.eval.evaluate_rows(\n", - " benchmark_id=\"meta-reference::simpleqa\",\n", + " task_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", From 10e8c964539e332ba45583acbdb41561376258ce Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:18:48 -0800 Subject: [PATCH 15/31] add benchmarks --- llama_stack/apis/benchmarks/__init__.py | 7 ++ llama_stack/apis/benchmarks/benchmarks.py | 86 +++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 llama_stack/apis/benchmarks/__init__.py create mode 100644 llama_stack/apis/benchmarks/benchmarks.py diff --git a/llama_stack/apis/benchmarks/__init__.py b/llama_stack/apis/benchmarks/__init__.py new file mode 100644 index 0000000000..f8f5649570 --- /dev/null +++ b/llama_stack/apis/benchmarks/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .benchmarks import * # noqa: F401 F403 diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py new file mode 100644 index 0000000000..75f2b3d053 --- /dev/null +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -0,0 +1,86 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable + +from llama_models.schema_utils import json_schema_type, webmethod +from pydantic import BaseModel, Field + +from llama_stack.apis.resource import Resource, ResourceType + + +class CommonBenchmarkFields(BaseModel): + dataset_id: str + scoring_functions: List[str] + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="Metadata for this evaluation task", + ) + + +@json_schema_type +class Benchmark(CommonBenchmarkFields, Resource): + type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value + + @property + def benchmark_id(self) -> str: + return self.identifier + + @property + def provider_benchmark_id(self) -> str: + return self.provider_resource_id + + +class BenchmarkInput(CommonBenchmarkFields, BaseModel): + benchmark_id: str + provider_id: Optional[str] = None + provider_benchmark_id: Optional[str] = None + + +class ListBenchmarksResponse(BaseModel): + data: List[Benchmark] + + +@runtime_checkable +class Benchmarks(Protocol): + @webmethod(route="/eval/benchmarks", method="GET") + async def list_benchmarks(self) -> ListBenchmarksResponse: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET") + async def get_benchmark( + self, + benchmark_id: str, + ) -> Optional[Benchmark]: ... + + @webmethod(route="/eval/benchmarks", method="POST") + async def register_benchmark( + self, + benchmark_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_benchmark_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... 
+ + @webmethod(route="/eval-tasks", method="GET") + async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ... + + @webmethod(route="/eval-tasks/{task_id}", method="GET") + async def DEPRECATED_get_eval_task( + self, + task_id: str, + ) -> Optional[Benchmark]: ... + + @webmethod(route="/eval-tasks", method="POST") + async def DEPRECATED_register_eval_task( + self, + task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_benchmark_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... From 234fe36d62366569ed4913ba6405401f92976740 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:38:40 -0800 Subject: [PATCH 16/31] fix cli download --- docs/_static/llama-stack-spec.html | 8 ++++---- docs/_static/llama-stack-spec.yaml | 8 ++++---- llama_stack/apis/benchmarks/benchmarks.py | 4 ++-- llama_stack/apis/eval/eval.py | 1 + llama_stack/cli/download.py | 14 +++++++------- llama_stack/cli/verify_download.py | 4 ++-- .../distribution/routers/routing_tables.py | 8 ++++---- .../inline/eval/meta_reference/eval.py | 2 +- llama_stack/providers/tests/eval/test_eval.py | 18 +++++++++--------- 9 files changed, 34 insertions(+), 33 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index b4506f5d5d..ea7a8f2100 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -108,8 +108,8 @@ "description": "", "parameters": [ { - "name": "task_id", - "in": "path", + "name": "eval_task_id", + "in": "query", "required": true, "schema": { "type": "string" @@ -3726,7 +3726,7 @@ "DeprecatedRegisterEvalTaskRequest": { "type": "object", "properties": { - "task_id": { + "eval_task_id": { "type": "string" }, "dataset_id": { @@ -3772,7 +3772,7 @@ }, "additionalProperties": false, "required": [ - "task_id", + "eval_task_id", "dataset_id", "scoring_functions" ] diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 6f655939eb..19c646bf98 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -50,8 +50,8 @@ paths: - Benchmarks description: '' parameters: - - name: task_id - in: path + - name: eval_task_id + in: query required: true schema: type: string @@ -2248,7 +2248,7 @@ components: DeprecatedRegisterEvalTaskRequest: type: object properties: - task_id: + eval_task_id: type: string dataset_id: type: string @@ -2272,7 +2272,7 @@ components: - type: object additionalProperties: false required: - - task_id + - eval_task_id - dataset_id - scoring_functions DeprecatedRunEvalRequest: diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py index 75f2b3d053..50019b18c7 100644 --- a/llama_stack/apis/benchmarks/benchmarks.py +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -71,13 +71,13 @@ async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ... @webmethod(route="/eval-tasks/{task_id}", method="GET") async def DEPRECATED_get_eval_task( self, - task_id: str, + eval_task_id: str, ) -> Optional[Benchmark]: ... 
@webmethod(route="/eval-tasks", method="POST") async def DEPRECATED_register_eval_task( self, - task_id: str, + eval_task_id: str, dataset_id: str, scoring_functions: List[str], provider_benchmark_id: Optional[str] = None, diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 010189cc7c..e5c7821503 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -39,6 +39,7 @@ class AgentCandidate(BaseModel): @json_schema_type class BenchmarkConfig(BaseModel): + type: Literal["benchmark"] = "benchmark" eval_candidate: EvalCandidate scoring_params: Dict[str, ScoringFnParams] = Field( description="Map between scoring function id and parameters for each scoring function you want to run", diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index 7b9b303f48..379ac49caa 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -105,7 +105,7 @@ class DownloadTask: output_file: str total_size: int = 0 downloaded_size: int = 0 - benchmark_id: Optional[int] = None + task_id: Optional[int] = None retries: int = 0 max_retries: int = 3 @@ -183,8 +183,8 @@ async def _get_info(): ) # Update the progress bar's total size once we know it - if task.benchmark_id is not None: - self.progress.update(task.benchmark_id, total=task.total_size) + if task.task_id is not None: + self.progress.update(task.task_id, total=task.total_size) except httpx.HTTPError as e: self.console.print(f"[red]Error getting file info: {str(e)}[/red]") @@ -207,7 +207,7 @@ async def _download_chunk(): file.write(chunk) task.downloaded_size += len(chunk) self.progress.update( - task.benchmark_id, + task.task_id, completed=task.downloaded_size, ) @@ -234,7 +234,7 @@ async def download_file(self, task: DownloadTask) -> None: if os.path.exists(task.output_file): if self.verify_file_integrity(task): self.console.print(f"[green]Already downloaded {task.output_file}[/green]") - self.progress.update(task.benchmark_id, completed=task.total_size) + self.progress.update(task.task_id, completed=task.total_size) return await self.prepare_download(task) @@ -258,7 +258,7 @@ async def download_file(self, task: DownloadTask) -> None: raise DownloadError(f"Download failed: {str(e)}") from e except Exception as e: - self.progress.update(task.benchmark_id, description=f"[red]Failed: {task.output_file}[/red]") + self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]") raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e def has_disk_space(self, tasks: List[DownloadTask]) -> bool: @@ -293,7 +293,7 @@ async def download_all(self, tasks: List[DownloadTask]) -> None: with self.progress: for task in tasks: desc = f"Downloading {Path(task.output_file).name}" - task.benchmark_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) + task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) semaphore = asyncio.Semaphore(self.max_concurrent_downloads) diff --git a/llama_stack/cli/verify_download.py b/llama_stack/cli/verify_download.py index ca72ca5818..47993c3613 100644 --- a/llama_stack/cli/verify_download.py +++ b/llama_stack/cli/verify_download.py @@ -82,7 +82,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) as progress: for filepath, expected_hash in checksums.items(): full_path = model_dir / filepath - benchmark_id = progress.add_task(f"Verifying {filepath}...", total=None) + task_id = 
progress.add_task(f"Verifying {filepath}...", total=None) exists = full_path.exists() actual_hash = None @@ -102,7 +102,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) ) - progress.remove_task(benchmark_id) + progress.remove_task(task_id) return results diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index b16becf1a9..99c73986ce 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -475,14 +475,14 @@ async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: async def DEPRECATED_get_eval_task( self, - task_id: str, + eval_task_id: str, ) -> Optional[Benchmark]: logger.warning("DEPRECATED: Use /eval/benchmarks instead") - return await self.get_benchmark(task_id) + return await self.get_benchmark(eval_task_id) async def DEPRECATED_register_eval_task( self, - task_id: str, + eval_task_id: str, dataset_id: str, scoring_functions: List[str], provider_benchmark_id: Optional[str] = None, @@ -491,7 +491,7 @@ async def DEPRECATED_register_eval_task( ) -> None: logger.warning("DEPRECATED: Use /eval/benchmarks instead") return await self.register_benchmark( - benchmark_id=task_id, + benchmark_id=eval_task_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 3ae530a47a..ea2acd7bbc 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -205,7 +205,7 @@ async def evaluate_rows( # scoring with generated_answer score_input_rows = [input_r | generated_r for input_r, generated_r in zip(input_rows, generations)] - if task_config.type == "app" and task_config.scoring_params is not None: + if task_config.scoring_params is not None: scoring_functions_dict = { scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None) for scoring_fn_id in scoring_functions diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py index 78351a28ef..c2f351aa83 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -59,14 +59,14 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model scoring_functions = [ "basic::equality", ] - task_id = "meta-reference::app_eval" + benchmark_id = "meta-reference::app_eval" await benchmarks_impl.register_benchmark( - benchmark_id=task_id, + benchmark_id=benchmark_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=rows.rows, scoring_functions=scoring_functions, task_config=AppBenchmarkConfig( @@ -105,14 +105,14 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): "basic::subset_of", ] - task_id = "meta-reference::app_eval-2" + benchmark_id = "meta-reference::app_eval-2" await benchmarks_impl.register_benchmark( - benchmark_id=task_id, + benchmark_id=benchmark_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.run_eval( - task_id=task_id, + benchmark_id=benchmark_id, task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, @@ -121,9 +121,9 @@ async def test_eval_run_eval(self, eval_stack, 
inference_model, judge_model): ), ) assert response.job_id == "0" - job_status = await eval_impl.job_status(task_id, response.job_id) + job_status = await eval_impl.job_status(benchmark_id, response.job_id) assert job_status and job_status.value == "completed" - eval_response = await eval_impl.job_result(task_id, response.job_id) + eval_response = await eval_impl.job_result(benchmark_id, response.job_id) assert eval_response is not None assert len(eval_response.generations) == 5 @@ -171,7 +171,7 @@ async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): benchmark_id = "meta-reference-mmlu" response = await eval_impl.run_eval( - task_id=benchmark_id, + benchmark_id=benchmark_id, task_config=BenchmarkBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, From 5f5a7b628f5aee9ce3a1559fd68ed67802f733d5 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:39:52 -0800 Subject: [PATCH 17/31] openapi --- docs/_static/llama-stack-spec.html | 171 +++++++++++++------------- docs/_static/llama-stack-spec.yaml | 185 +---------------------------- 2 files changed, 94 insertions(+), 262 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index ea7a8f2100..b93f6a380a 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2699,8 +2699,7 @@ "auto", "required" ], - "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.", - "default": "auto" + "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." 
}, "tool_prompt_format": { "type": "string", @@ -2815,6 +2814,11 @@ "BenchmarkConfig": { "type": "object", "properties": { + "type": { + "type": "string", + "const": "benchmark", + "default": "benchmark" + }, "eval_candidate": { "$ref": "#/components/schemas/EvalCandidate" }, @@ -2830,6 +2834,7 @@ }, "additionalProperties": false, "required": [ + "type", "eval_candidate", "scoring_params" ] @@ -3498,17 +3503,7 @@ "ScoringResult": { "type": "object", "properties": { - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricEvent" - } - }, - "completion_message": { - "$ref": "#/components/schemas/CompletionMessage", - "description": "The complete response message" - }, - "logprobs": { + "score_rows": { "type": "array", "items": { "type": "object", @@ -3568,75 +3563,7 @@ "aggregated_results" ] }, - "MetricEvent": { - "type": "object", - "properties": { - "trace_id": { - "type": "string" - }, - "span_id": { - "type": "string" - }, - "timestamp": { - "type": "string", - "format": "date-time" - }, - "attributes": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } - }, - "type": { - "type": "string", - "const": "metric", - "default": "metric" - }, - "metric": { - "type": "string" - }, - "value": { - "oneOf": [ - { - "type": "integer" - }, - { - "type": "number" - } - ] - }, - "unit": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "trace_id", - "span_id", - "timestamp", - "type", - "metric", - "value", - "unit" - ] - }, - "TokenLogProbs": { + "Benchmark": { "type": "object", "properties": { "identifier": { @@ -4225,6 +4152,12 @@ "ChatCompletionResponse": { "type": "object", "properties": { + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } + }, "completion_message": { "$ref": "#/components/schemas/CompletionMessage", "description": "The complete response message" @@ -4243,6 +4176,74 @@ ], "description": "Response from a chat completion request." 
}, + "MetricEvent": { + "type": "object", + "properties": { + "trace_id": { + "type": "string" + }, + "span_id": { + "type": "string" + }, + "timestamp": { + "type": "string", + "format": "date-time" + }, + "attributes": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + }, + "type": { + "type": "string", + "const": "metric", + "default": "metric" + }, + "metric": { + "type": "string" + }, + "value": { + "oneOf": [ + { + "type": "integer" + }, + { + "type": "number" + } + ] + }, + "unit": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "trace_id", + "span_id", + "timestamp", + "type", + "metric", + "value", + "unit" + ] + }, "TokenLogProbs": { "type": "object", "properties": { @@ -4470,6 +4471,12 @@ "ChatCompletionResponseStreamChunk": { "type": "object", "properties": { + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } + }, "event": { "$ref": "#/components/schemas/ChatCompletionResponseEvent", "description": "The event containing the new content" diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 19c646bf98..b30025020b 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1641,7 +1641,6 @@ components: Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model. - default: auto tool_prompt_format: type: string enum: @@ -1713,6 +1712,10 @@ components: BenchmarkConfig: type: object properties: + type: + type: string + const: benchmark + default: benchmark eval_candidate: $ref: '#/components/schemas/EvalCandidate' scoring_params: @@ -1723,6 +1726,7 @@ components: type: integer additionalProperties: false required: + - type - eval_candidate - scoring_params EvalCandidate: @@ -2960,185 +2964,6 @@ components: - delta description: >- A chunk of a streamed completion response. - AgentConfig: - type: object - properties: - sampling_params: - $ref: '#/components/schemas/SamplingParams' - input_shields: - type: array - items: - type: string - output_shields: - type: array - items: - type: string - toolgroups: - type: array - items: - $ref: '#/components/schemas/AgentTool' - client_tools: - type: array - items: - $ref: '#/components/schemas/ToolDef' - tool_choice: - type: string - enum: - - auto - - required - description: >- - Whether tool use is required or automatic. This is a hint to the model - which may not be followed. It depends on the Instruction Following capabilities - of the model. - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - Prompt format for calling custom / zero shot tools. 
- tool_config: - $ref: '#/components/schemas/ToolConfig' - max_infer_iters: - type: integer - default: 10 - model: - type: string - instructions: - type: string - enable_session_persistence: - type: boolean - response_format: - $ref: '#/components/schemas/ResponseFormat' - additionalProperties: false - required: - - model - - instructions - - enable_session_persistence - AgentTool: - oneOf: - - type: string - - type: object - properties: - name: - type: string - args: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - name - - args - ToolDef: - type: object - properties: - type: - type: string - const: text - default: text - text: - type: string - additionalProperties: false - required: - - type - - text - ToolCallDelta: - type: object - properties: - type: - type: string - const: tool_call - default: tool_call - tool_call: - oneOf: - - type: string - - $ref: '#/components/schemas/ToolCall' - parse_status: - type: string - enum: - - started - - in_progress - - failed - - succeeded - additionalProperties: false - required: - - type - - tool_call - - parse_status - CompletionRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. The model must be registered with - Llama Stack and available via the /models endpoint. - content: - $ref: '#/components/schemas/InterleavedContent' - description: The content to generate a completion for - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: >- - (Optional) Parameters to control the sampling strategy - response_format: - $ref: '#/components/schemas/ResponseFormat' - description: >- - (Optional) Grammar specification for guided (structured) decoding - stream: - type: boolean - description: >- - (Optional) If True, generate an SSE event stream of the response. Defaults - to False. - logprobs: - type: object - properties: - top_k: - type: integer - default: 0 - description: >- - How many tokens (for each position) to return log probabilities for. - additionalProperties: false - description: >- - (Optional) If specified, log probabilities for each token position will - be returned. - additionalProperties: false - required: - - model_id - - content - CompletionResponseStreamChunk: - type: object - properties: - delta: - type: string - description: >- - New content generated since last chunk. This can be one or more tokens. - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Optional reason why generation stopped, if complete - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - description: >- - Optional log probabilities for generated tokens - additionalProperties: false - required: - - delta - description: >- - A chunk of a streamed completion response. CreateAgentRequest: type: object properties: From bd94769c7dde5048cc98735b449339571adba79e Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 13 Feb 2025 01:03:28 -0500 Subject: [PATCH 18/31] feat: support listing all for `llama stack list-providers` (#1056) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Support listing all for `llama stack list-providers`. For ease of reading, sort the output rows by type. Before the change. 
```  llama stack list-providers usage: llama stack list-providers [-h] {inference,safety,agents,vector_io,datasetio,scoring,eval,post_training,tool_runtime,telemetry} llama stack list-providers: error: the following arguments are required: api ``` After the change. ``` +---------------+----------------------------------+----------------------------------------------------------------------------------+ | API Type | Provider Type | PIP Package Dependencies | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | agents | inline::meta-reference | matplotlib,pillow,pandas,scikit-learn,aiosqlite,psycopg2-binary,redis | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | datasetio | inline::localfs | pandas | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | datasetio | remote::huggingface | datasets | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | eval | inline::meta-reference | | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | inline::meta-reference | accelerate,blobfile,fairscale,torch,torchvision,transformers,zmq,lm-format- | | | | enforcer,sentence-transformers | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | inline::meta-reference-quantized | accelerate,blobfile,fairscale,torch,torchvision,transformers,zmq,lm-format- | | | | enforcer,sentence-transformers,fbgemm-gpu,torchao==0.5.0 | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | inline::sentence-transformers | sentence-transformers | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | inline::vllm | vllm | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::bedrock | boto3 | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::cerebras | cerebras_cloud_sdk | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::databricks | openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::fireworks | fireworks-ai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::groq | groq | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::hf::endpoint | huggingface_hub,aiohttp | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::hf::serverless | huggingface_hub,aiohttp | 
+---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::nvidia | openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::ollama | ollama,aiohttp | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::runpod | openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::sambanova | openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::tgi | huggingface_hub,aiohttp | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::together | together | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::vllm | openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | post_training | inline::torchtune | torch,torchtune==0.5.0,torchao==0.8.0,numpy | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | safety | inline::code-scanner | codeshield | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | safety | inline::llama-guard | | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | safety | inline::meta-reference | transformers,torch --index-url https://download.pytorch.org/whl/cpu | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | safety | inline::prompt-guard | transformers,torch --index-url https://download.pytorch.org/whl/cpu | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | safety | remote::bedrock | boto3 | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | scoring | inline::basic | | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | scoring | inline::braintrust | autoevals,openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | scoring | inline::llm-as-judge | | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | telemetry | inline::meta-reference | opentelemetry-sdk,opentelemetry-exporter-otlp-proto-http | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | inline::code-interpreter | | 
+---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | inline::rag-runtime | | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | remote::bing-search | requests | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | remote::brave-search | requests | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | remote::model-context-protocol | mcp | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | remote::tavily-search | requests | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | remote::wolfram-alpha | requests | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | inline::chromadb | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,chromadb | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | inline::faiss | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,faiss-cpu | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | inline::meta-reference | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,faiss-cpu | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | remote::chromadb | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,chromadb- | | | | client | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | remote::pgvector | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no- | | | | deps,psycopg2-binary | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | remote::qdrant | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,qdrant- | | | | client | 
+---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | remote::weaviate | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,weaviate- | | | | client | +---------------+----------------------------------+----------------------------------------------------------------------------------+ ``` [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Manually. [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- llama_stack/cli/stack/list_providers.py | 26 +++++++++++++++++++------ llama_stack/cli/table.py | 7 ++++++- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/llama_stack/cli/stack/list_providers.py b/llama_stack/cli/stack/list_providers.py index bd152c9800..bfe11aa2c7 100644 --- a/llama_stack/cli/stack/list_providers.py +++ b/llama_stack/cli/stack/list_providers.py @@ -21,15 +21,19 @@ def __init__(self, subparsers: argparse._SubParsersAction): self._add_arguments() self.parser.set_defaults(func=self._run_providers_list_cmd) - def _add_arguments(self): + @property + def providable_apis(self): from llama_stack.distribution.distribution import providable_apis - api_values = [api.value for api in providable_apis()] + return [api.value for api in providable_apis()] + + def _add_arguments(self): self.parser.add_argument( "api", type=str, - choices=api_values, - help="API to list providers for (one of: {})".format(api_values), + choices=self.providable_apis, + nargs="?", + help="API to list providers for. List all if not specified.", ) def _run_providers_list_cmd(self, args: argparse.Namespace) -> None: @@ -37,20 +41,29 @@ def _run_providers_list_cmd(self, args: argparse.Namespace) -> None: from llama_stack.distribution.distribution import Api, get_provider_registry all_providers = get_provider_registry() - providers_for_api = all_providers[Api(args.api)] + if args.api: + providers = [(args.api, all_providers[Api(args.api)])] + else: + providers = [(k.value, prov) for k, prov in all_providers.items()] + + providers = [p for api, p in providers if api in self.providable_apis] # eventually, this should query a registry at llama.meta.com/llamastack/distributions headers = [ + "API Type", "Provider Type", "PIP Package Dependencies", ] rows = [] - for spec in providers_for_api.values(): + + specs = [spec for p in providers for spec in p.values()] + for spec in specs: if spec.is_sample: continue rows.append( [ + spec.api.value, spec.provider_type, ",".join(spec.pip_packages), ] @@ -59,4 +72,5 @@ def _run_providers_list_cmd(self, args: argparse.Namespace) -> None: rows, headers, separate_rows=True, + sort_by=(0, 1), ) diff --git a/llama_stack/cli/table.py b/llama_stack/cli/table.py index 50f54852bc..847719f817 100644 --- a/llama_stack/cli/table.py +++ b/llama_stack/cli/table.py @@ -6,6 +6,7 @@ import re import textwrap +from typing import Iterable from termcolor import cprint @@ -39,11 +40,15 @@ def wrap(text, width): return "\n".join(lines) -def print_table(rows, headers=None, separate_rows: bool = False): +def print_table(rows, headers=None, separate_rows: bool = False, sort_by: Iterable[int] = tuple()): def itemlen(item): return max([len(line) for line in strip_ansi_colors(item).split("\n")]) rows = [[x or "" for x in row] for row in rows] + + if sort_by: + rows.sort(key=lambda 
x: tuple(x[i] for i in sort_by))
+
     if not headers:
         col_widths = [max(itemlen(item) for item in col) for col in zip(*rows)]
     else:

From a5d21e6f23d1fa86fff753f752fbb90775142f8f Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Thu, 13 Feb 2025 10:57:30 -0500
Subject: [PATCH 19/31] docs: Mention conventional commits format in
 CONTRIBUTING.md (#1075)

# What does this PR do?

This adds a note to ensure pull requests follow the conventional commits
format, along with a link to that format, in CONTRIBUTING.md. One of the
pull-request checks enforces PR titles that match this format, so it's
good to be upfront about this expectation before a new developer opens a
PR.

Signed-off-by: Ben Browning
---
 CONTRIBUTING.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8028c194e5..6dc08b5c0a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -40,6 +40,7 @@ If you need help or guidance, comment on the issue. Issues that are extra friend
 3. Ensure the test suite passes.
 4. Make sure your code lints using `pre-commit`.
 5. If you haven't already, complete the Contributor License Agreement ("CLA").
+6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
 
 ## Contributor License Agreement ("CLA")
 In order to accept your pull request, we need you to submit a CLA. You only need

From 06c732a008ae877c688b69532bfcead0e3b68a4c Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Thu, 13 Feb 2025 11:00:00 -0500
Subject: [PATCH 20/31] fix: logprobs support in remote-vllm provider (#1074)

# What does this PR do?

The remote-vllm provider was not passing logprobs options from
CompletionRequest or ChatCompletionRequests through to the OpenAI client
parameters. I manually verified this, as well as observed this provider
failing `TestInference::test_completion_logprobs`. This was filed as
issue #1073.

This fixes that by passing the `logprobs.top_k` value through to the
parameters we pass into the OpenAI client. Additionally, this fixes a
bug in `test_text_inference.py` where it mistakenly assumed chunk.delta
was of type `ContentDelta` for completion requests. The deltas are of
type `ContentDelta` for chat completion requests, but for basic
completion requests the deltas are of type string.

This test was likely failing for other providers that did properly
support logprobs because of this latter issue in the test, which was
hit while fixing the above issue with the remote-vllm provider.

(Closes #1073)

## Test Plan

First, you need a vllm running. I ran one locally like this:
```
vllm serve meta-llama/Llama-3.2-3B-Instruct --port 8001 --enable-auto-tool-choice --tool-call-parser llama3_json
```

Next, run test_text_inference.py against this vllm using the remote vllm
provider like this:
```
VLLM_URL="http://localhost:8001/v1" python -m pytest -s -v llama_stack/providers/tests/inference/test_text_inference.py --providers "inference=vllm_remote"
```

Before my change, the test failed with this error:
```
llama_stack/providers/tests/inference/test_text_inference.py:155: in test_completion_logprobs
    assert 1 <= len(response.logprobs) <= 5
E   TypeError: object of type 'NoneType' has no len()
```

After my change, the test passes.
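
As an extra, informal check (a sketch, not part of the test suite: it
assumes the same vLLM server as above and talks to it with the OpenAI
client directly, mirroring what the fixed `_get_params` now sends for
`logprobs.top_k`; the `EMPTY` api key is a placeholder):
```
# Hypothetical spot-check against the vLLM server started above. The
# `logprobs=3` argument mirrors how CompletionRequest.logprobs.top_k is
# now forwarded to the OpenAI client by the remote-vllm provider.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8001/v1", api_key="EMPTY")
resp = client.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    prompt="The capital of France is",
    max_tokens=5,
    logprobs=3,
)
# Each generated token should come back with its top-3 log probabilities.
print(resp.choices[0].logprobs.top_logprobs)
```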
[//]: # (## Documentation) Signed-off-by: Ben Browning --- llama_stack/providers/remote/inference/vllm/vllm.py | 3 +++ llama_stack/providers/tests/inference/test_text_inference.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 02594891be..3574768b51 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -345,6 +345,9 @@ async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequ else: raise ValueError(f"Unknown response format {fmt.type}") + if request.logprobs and request.logprobs.top_k: + input_dict["logprobs"] = request.logprobs.top_k + return { "model": request.model, **input_dict, diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py index 99f968cbc2..6a72591238 100644 --- a/llama_stack/providers/tests/inference/test_text_inference.py +++ b/llama_stack/providers/tests/inference/test_text_inference.py @@ -175,7 +175,7 @@ async def test_completion_logprobs(self, inference_model, inference_stack): 1 <= len(chunks) <= 6 ) # why 6 and not 5? the response may have an extra closing chunk, e.g. for usage or stop_reason for chunk in chunks: - if chunk.delta.type == "text" and chunk.delta.text: # if there's a token, we expect logprobs + if chunk.delta: # if there's a token, we expect logprobs assert chunk.logprobs, "Logprobs should not be empty" assert all(len(logprob.logprobs_by_token) == 3 for logprob in chunk.logprobs) else: # no token, no logprobs From 40468aaa14f2a685db9d80d7ef0cd4afd5e7c832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 13 Feb 2025 17:07:59 +0100 Subject: [PATCH 21/31] fix: improve signal handling and update dependencies (#1044) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? This commit enhances the signal handling mechanism in the server by improving the `handle_signal` (previously handle_sigint) function. It now properly retrieves the signal name, ensuring clearer logging when a termination signal is received. Additionally, it cancels all running tasks and waits for their completion before stopping the event loop, allowing for a more graceful shutdown. Support for handling SIGTERM has also been added alongside SIGINT. Before the changes, handle_sigint used asyncio.run(run_shutdown()). However, asyncio.run() is meant to start a new event loop, and calling it inside an existing one (like when running Uvicorn) raises an error. The fix replaces asyncio.run(run_shutdown()) with an async function scheduled on the existing loop using loop.create_task(shutdown()). This ensures that the shutdown coroutine runs within the current event loop instead of trying to create a new one. Furthermore, this commit updates the project dependencies. `fastapi` and `uvicorn` have been added to the development dependencies in `pyproject.toml` and `uv.lock`, ensuring that the necessary packages are available for development and execution. 
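
To make the event-loop pitfall described above concrete, here is a
minimal, standalone sketch of the pattern (illustrative only, not the
server code itself; a timer stands in for a real signal being delivered):
```
import asyncio

async def shutdown():
    # Real cleanup would await each implementation's shutdown() here.
    print("cleaning up, then stopping the loop")
    asyncio.get_running_loop().stop()

def handler():
    # asyncio.run(shutdown()) here would raise "RuntimeError: asyncio.run()
    # cannot be called from a running event loop", so instead schedule the
    # coroutine on the loop that is already running.
    asyncio.get_running_loop().create_task(shutdown())

loop = asyncio.new_event_loop()
loop.call_later(0.1, handler)  # stand-in for SIGINT/SIGTERM arriving
loop.run_forever()
loop.close()
```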
Closes: https://github.com/meta-llama/llama-stack/issues/1043 Signed-off-by: Sébastien Han [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Run a server and send SIGINT: ``` INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" python -m llama_stack.distribution.server.server --yaml-config ./llama_stack/templates/ollama/run.yaml Using config file: llama_stack/templates/ollama/run.yaml Run configuration: apis: - agents - datasetio - eval - inference - safety - scoring - telemetry - tool_runtime - vector_io container_image: null datasets: [] eval_tasks: [] image_name: ollama metadata_store: db_path: /Users/leseb/.llama/distributions/ollama/registry.db namespace: null type: sqlite models: - metadata: {} model_id: meta-llama/Llama-3.2-3B-Instruct model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType - llm provider_id: ollama provider_model_id: null - metadata: embedding_dimension: 384 model_id: all-MiniLM-L6-v2 model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType - embedding provider_id: sentence-transformers provider_model_id: null providers: agents: - config: persistence_store: db_path: /Users/leseb/.llama/distributions/ollama/agents_store.db namespace: null type: sqlite provider_id: meta-reference provider_type: inline::meta-reference datasetio: - config: {} provider_id: huggingface provider_type: remote::huggingface - config: {} provider_id: localfs provider_type: inline::localfs eval: - config: {} provider_id: meta-reference provider_type: inline::meta-reference inference: - config: url: http://localhost:11434 provider_id: ollama provider_type: remote::ollama - config: {} provider_id: sentence-transformers provider_type: inline::sentence-transformers safety: - config: {} provider_id: llama-guard provider_type: inline::llama-guard scoring: - config: {} provider_id: basic provider_type: inline::basic - config: {} provider_id: llm-as-judge provider_type: inline::llm-as-judge - config: openai_api_key: '********' provider_id: braintrust provider_type: inline::braintrust telemetry: - config: service_name: llama-stack sinks: console,sqlite sqlite_db_path: /Users/leseb/.llama/distributions/ollama/trace_store.db provider_id: meta-reference provider_type: inline::meta-reference tool_runtime: - config: api_key: '********' max_results: 3 provider_id: brave-search provider_type: remote::brave-search - config: api_key: '********' max_results: 3 provider_id: tavily-search provider_type: remote::tavily-search - config: {} provider_id: code-interpreter provider_type: inline::code-interpreter - config: {} provider_id: rag-runtime provider_type: inline::rag-runtime vector_io: - config: kvstore: db_path: /Users/leseb/.llama/distributions/ollama/faiss_store.db namespace: null type: sqlite provider_id: faiss provider_type: inline::faiss scoring_fns: [] server: port: 8321 tls_certfile: null tls_keyfile: null shields: [] tool_groups: - args: null mcp_endpoint: null provider_id: tavily-search toolgroup_id: builtin::websearch - args: null mcp_endpoint: null provider_id: rag-runtime toolgroup_id: builtin::rag - args: null mcp_endpoint: null provider_id: code-interpreter toolgroup_id: builtin::code_interpreter vector_dbs: [] version: '2' INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:213: Resolved 31 providers INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-inference => ollama INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-inference => 
sentence-transformers INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: models => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inference => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-vector_io => faiss INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-safety => llama-guard INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: shields => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: safety => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: vector_dbs => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: vector_io => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-tool_runtime => brave-search INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-tool_runtime => tavily-search INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-tool_runtime => code-interpreter INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-tool_runtime => rag-runtime INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: tool_groups => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: tool_runtime => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: agents => meta-reference INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-datasetio => huggingface INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-datasetio => localfs INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: datasets => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: datasetio => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: telemetry => meta-reference INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-scoring => basic INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-scoring => llm-as-judge INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-scoring => braintrust INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: scoring_functions => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: scoring => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-eval => meta-reference INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: eval_tasks => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: eval => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inspect => __builtin__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:216: INFO 2025-02-12 10:21:03,723 llama_stack.providers.remote.inference.ollama.ollama:148: checking connectivity to Ollama at `http://localhost:11434`... INFO 2025-02-12 10:21:03,734 httpx:1740: HTTP Request: GET http://localhost:11434/api/ps "HTTP/1.1 200 OK" INFO 2025-02-12 10:21:03,843 faiss.loader:148: Loading faiss. INFO 2025-02-12 10:21:03,865 faiss.loader:150: Successfully loaded faiss. INFO 2025-02-12 10:21:03,868 faiss:173: Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. Warning: `bwrap` is not available. Code interpreter tool will not work correctly. 
INFO 2025-02-12 10:21:04,315 datasets:54: PyTorch version 2.6.0 available. INFO 2025-02-12 10:21:04,556 httpx:1740: HTTP Request: GET http://localhost:11434/api/ps "HTTP/1.1 200 OK" INFO 2025-02-12 10:21:04,557 llama_stack.providers.utils.inference.embedding_mixin:42: Loading sentence transformer for all-MiniLM-L6-v2... INFO 2025-02-12 10:21:07,202 sentence_transformers.SentenceTransformer:210: Use pytorch device_name: mps INFO 2025-02-12 10:21:07,202 sentence_transformers.SentenceTransformer:218: Load pretrained SentenceTransformer: all-MiniLM-L6-v2 INFO 2025-02-12 10:21:09,500 llama_stack.distribution.stack:102: Models: all-MiniLM-L6-v2 served by sentence-transformers INFO 2025-02-12 10:21:09,500 llama_stack.distribution.stack:102: Models: meta-llama/Llama-3.2-3B-Instruct served by ollama INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: basic::equality served by basic INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: basic::regex_parser_multiple_choice_answer served by basic INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: basic::subset_of served by basic INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::answer-correctness served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::answer-relevancy served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::answer-similarity served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-entity-recall served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-precision served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-recall served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-relevancy served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::factuality served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::faithfulness served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: llm-as-judge::405b-simpleqa served by llm-as-judge INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: llm-as-judge::base served by llm-as-judge INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Tool_groups: builtin::code_interpreter served by code-interpreter INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Tool_groups: builtin::rag served by rag-runtime INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Tool_groups: builtin::websearch served by tavily-search INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:106: Serving API eval POST /v1/eval/tasks/{task_id}/evaluations DELETE /v1/eval/tasks/{task_id}/jobs/{job_id} GET /v1/eval/tasks/{task_id}/jobs/{job_id}/result GET /v1/eval/tasks/{task_id}/jobs/{job_id} POST /v1/eval/tasks/{task_id}/jobs Serving API agents POST /v1/agents POST /v1/agents/{agent_id}/session POST /v1/agents/{agent_id}/session/{session_id}/turn DELETE /v1/agents/{agent_id} DELETE /v1/agents/{agent_id}/session/{session_id} GET /v1/agents/{agent_id}/session/{session_id} GET /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id} GET 
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id} Serving API scoring_functions GET /v1/scoring-functions/{scoring_fn_id} GET /v1/scoring-functions POST /v1/scoring-functions Serving API safety POST /v1/safety/run-shield Serving API inspect GET /v1/health GET /v1/inspect/providers GET /v1/inspect/routes GET /v1/version Serving API tool_runtime POST /v1/tool-runtime/invoke GET /v1/tool-runtime/list-tools POST /v1/tool-runtime/rag-tool/insert POST /v1/tool-runtime/rag-tool/query Serving API datasetio POST /v1/datasetio/rows GET /v1/datasetio/rows Serving API shields GET /v1/shields/{identifier} GET /v1/shields POST /v1/shields Serving API eval_tasks GET /v1/eval-tasks/{eval_task_id} GET /v1/eval-tasks POST /v1/eval-tasks Serving API models GET /v1/models/{model_id} GET /v1/models POST /v1/models DELETE /v1/models/{model_id} Serving API datasets GET /v1/datasets/{dataset_id} GET /v1/datasets POST /v1/datasets DELETE /v1/datasets/{dataset_id} Serving API vector_io POST /v1/vector-io/insert POST /v1/vector-io/query Serving API inference POST /v1/inference/chat-completion POST /v1/inference/completion POST /v1/inference/embeddings Serving API tool_groups GET /v1/tools/{tool_name} GET /v1/toolgroups/{toolgroup_id} GET /v1/toolgroups GET /v1/tools POST /v1/toolgroups DELETE /v1/toolgroups/{toolgroup_id} Serving API vector_dbs GET /v1/vector-dbs/{vector_db_id} GET /v1/vector-dbs POST /v1/vector-dbs DELETE /v1/vector-dbs/{vector_db_id} Serving API scoring POST /v1/scoring/score POST /v1/scoring/score-batch Serving API telemetry GET /v1/telemetry/traces/{trace_id}/spans/{span_id} GET /v1/telemetry/spans/{span_id}/tree GET /v1/telemetry/traces/{trace_id} POST /v1/telemetry/events GET /v1/telemetry/spans GET /v1/telemetry/traces POST /v1/telemetry/spans/export Listening on ['::', '0.0.0.0']:5001 INFO: Started server process [65372] INFO: Waiting for application startup. INFO: ASGI 'lifespan' protocol appears unsupported. INFO: Application startup complete. INFO: Uvicorn running on http://['::', '0.0.0.0']:5001 (Press CTRL+C to quit) ^CINFO: Shutting down INFO: Finished server process [65372] Received signal SIGINT (2). Exiting gracefully... 
INFO 2025-02-12 10:21:11,215 __main__:151: Shutting down ModelsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down InferenceRouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ShieldsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down SafetyRouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down VectorDBsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down VectorIORouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ToolGroupsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ToolRuntimeRouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down MetaReferenceAgentsImpl INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down DatasetsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down DatasetIORouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down TelemetryAdapter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ScoringFunctionsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ScoringRouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down EvalTasksRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down EvalRouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down DistributionInspectImpl ``` [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) Signed-off-by: Sébastien Han --- llama_stack/distribution/inspect.py | 3 + .../distribution/routers/routing_tables.py | 3 + llama_stack/distribution/server/server.py | 77 ++++++++++++++++--- .../inline/agents/meta_reference/agents.py | 3 + pyproject.toml | 2 + uv.lock | 18 +++++ 6 files changed, 94 insertions(+), 12 deletions(-) diff --git a/llama_stack/distribution/inspect.py b/llama_stack/distribution/inspect.py index b7ee4a219f..fddb625701 100644 --- a/llama_stack/distribution/inspect.py +++ b/llama_stack/distribution/inspect.py @@ -82,3 +82,6 @@ async def health(self) -> HealthInfo: async def version(self) -> VersionInfo: return VersionInfo(version=version("llama-stack")) + + async def shutdown(self) -> None: + pass diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 99c73986ce..ec258af491 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -570,3 +570,6 @@ async def unregister_toolgroup(self, toolgroup_id: str) -> None: for tool in tools: await self.unregister_object(tool) await self.unregister_object(tool_group) + + async def shutdown(self) -> None: + pass diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index d2c32de119..bb735268b6 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -7,6 +7,7 @@ import argparse import asyncio import functools +import logging import inspect import json import os @@ -52,6 +53,9 @@ REPO_ROOT = Path(__file__).parent.parent.parent.parent +logging.basicConfig(level=logging.INFO, format="%(levelname)s %(asctime)s %(name)s:%(lineno)d: %(message)s") +logger = logging.getLogger(__name__) + def warn_with_traceback(message, category, filename, lineno, file=None, line=None): log = file if hasattr(file, "write") else sys.stderr @@ -112,21 +116,69 @@ def translate_exception(exc: Exception) -> Union[HTTPException, RequestValidatio ) -def handle_sigint(app, *args, **kwargs): - print("SIGINT or CTRL-C detected. 
Exiting gracefully...")
+def handle_signal(app, signum, _) -> None:
+    """
+    Handle incoming signals and initiate a graceful shutdown of the application.
+
+    This function is intended to be used as a signal handler for various signals
+    (e.g., SIGINT, SIGTERM). Upon receiving a signal, it will print a message
+    indicating the received signal and initiate a shutdown process.
 
-    async def run_shutdown():
-        for impl in app.__llama_stack_impls__.values():
-            print(f"Shutting down {impl}")
-            await impl.shutdown()
+    Args:
+        app: The application instance containing implementations to be shut down.
+        signum (int): The signal number received.
+        frame: The current stack frame (not used in this function).
 
-    asyncio.run(run_shutdown())
+    The shutdown process involves:
+    - Shutting down all implementations registered in the application.
+    - Gathering all running asyncio tasks.
+    - Cancelling all gathered tasks.
+    - Waiting for all tasks to finish.
+    - Stopping the event loop.
 
-    loop = asyncio.get_event_loop()
-    for task in asyncio.all_tasks(loop):
-        task.cancel()
+    Note:
+        This function schedules the shutdown process as an asyncio task and does
+        not block the current execution.
+    """
+    signame = signal.Signals(signum).name
+    print(f"Received signal {signame} ({signum}). Exiting gracefully...")
+
+    async def shutdown():
+        try:
+            # Gracefully shut down implementations
+            for impl in app.__llama_stack_impls__.values():
+                impl_name = impl.__class__.__name__
+                logger.info("Shutting down %s", impl_name)
+                try:
+                    if hasattr(impl, "shutdown"):
+                        await asyncio.wait_for(impl.shutdown(), timeout=5)
+                    else:
+                        logger.warning("No shutdown method for %s", impl_name)
+                except asyncio.TimeoutError:
+                    logger.exception("Shutdown timeout for %s", impl_name)
+                except Exception as e:
+                    logger.exception("Failed to shutdown %s: %s", impl_name, e)
+
+            # Gather all running tasks
+            loop = asyncio.get_running_loop()
+            tasks = [task for task in asyncio.all_tasks(loop) if task is not asyncio.current_task()]
+
+            # Cancel all tasks
+            for task in tasks:
+                task.cancel()
+
+            # Wait for all tasks to finish
+            try:
+                await asyncio.wait_for(asyncio.gather(*tasks, return_exceptions=True), timeout=10)
+            except asyncio.TimeoutError:
+                logger.exception("Timeout while waiting for tasks to finish")
+            except asyncio.CancelledError:
+                pass
+            finally:
+                loop.stop()
 
-    loop.stop()
+    loop = asyncio.get_running_loop()
+    loop.create_task(shutdown())
 
 
 @asynccontextmanager
@@ -386,7 +438,8 @@ def main():
         print("")
     app.exception_handler(RequestValidationError)(global_exception_handler)
     app.exception_handler(Exception)(global_exception_handler)
-    signal.signal(signal.SIGINT, functools.partial(handle_sigint, app))
+    signal.signal(signal.SIGINT, functools.partial(handle_signal, app))
+    signal.signal(signal.SIGTERM, functools.partial(handle_signal, app))
 
     app.__llama_stack_impls__ = impls
 
diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py
index fe4ccd1a33..e3c18d1122 100644
--- a/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -212,3 +212,6 @@ async def delete_agents_session(self, agent_id: str, session_id: str) -> None:
 
     async def delete_agent(self, agent_id: str) -> None:
         await self.persistence_store.delete(f"agent:{agent_id}")
+
+    async def shutdown(self) -> None:
+        pass
diff --git a/pyproject.toml b/pyproject.toml
index 5e9cb75e25..2f40ceac9f 100644
--- a/pyproject.toml
+++ 
b/pyproject.toml @@ -46,6 +46,8 @@ dev = [ "types-requests", "types-setuptools", "pre-commit", + "uvicorn", + "fastapi", ] docs = [ "sphinx-autobuild", diff --git a/uv.lock b/uv.lock index 087396eeac..97ae521244 100644 --- a/uv.lock +++ b/uv.lock @@ -431,6 +431,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702 }, ] +[[package]] +name = "fastapi" +version = "0.115.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "starlette" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a2/b2/5a5dc4affdb6661dea100324e19a7721d5dc524b464fe8e366c093fd7d87/fastapi-0.115.8.tar.gz", hash = "sha256:0ce9111231720190473e222cdf0f07f7206ad7e53ea02beb1d2dc36e2f0741e9", size = 295403 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/7d/2d6ce181d7a5f51dedb8c06206cbf0ec026a99bf145edd309f9e17c3282f/fastapi-0.115.8-py3-none-any.whl", hash = "sha256:753a96dd7e036b34eeef8babdfcfe3f28ff79648f86551eb36bfc1b0bf4a8cbf", size = 94814 }, +] + [[package]] name = "fastjsonschema" version = "2.21.1" @@ -724,6 +738,7 @@ dependencies = [ [package.optional-dependencies] dev = [ { name = "black" }, + { name = "fastapi" }, { name = "nbval" }, { name = "pre-commit" }, { name = "pytest" }, @@ -731,6 +746,7 @@ dev = [ { name = "ruff" }, { name = "types-requests" }, { name = "types-setuptools" }, + { name = "uvicorn" }, ] docs = [ { name = "myst-parser" }, @@ -748,6 +764,7 @@ docs = [ requires-dist = [ { name = "black", marker = "extra == 'dev'" }, { name = "blobfile" }, + { name = "fastapi", marker = "extra == 'dev'" }, { name = "fire" }, { name = "httpx" }, { name = "huggingface-hub" }, @@ -776,6 +793,7 @@ requires-dist = [ { name = "termcolor" }, { name = "types-requests", marker = "extra == 'dev'" }, { name = "types-setuptools", marker = "extra == 'dev'" }, + { name = "uvicorn", marker = "extra == 'dev'" }, ] [[package]] From b56465738dc8433b9131bf08956b23381aa83e3d Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Fri, 14 Feb 2025 00:33:11 +0800 Subject: [PATCH 22/31] style: update model id in model list title (#1072) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] Since the subcommands used `MODEL_ID`, it would be better to use it in `model list` and make it easy to find it. 
``` $ llama model verify-download --help usage: llama model verify-download [-h] --model-id MODEL_ID << $ llama model describe --help usage: llama model describe [-h] -m MODEL_ID << $ llama download --help --model-id MODEL_ID See `llama model list` or `llama model list --show-all` for the list of available models before: $ llama model list +-----------------------------------------+-----------------------------------------------------+----------------+ | Model Descriptor | Hugging Face Repo | Context Length | +-----------------------------------------+-----------------------------------------------------+----------------+ after: $ llama model list +-----------------------------------------+-----------------------------------------------------+----------------+ | Model Descriptor | Model ID | Context Length | +-----------------------------------------+-----------------------------------------------------+----------------+ | Llama3.1-8B | meta-llama/Llama-3.1-8B | 128K | +-----------------------------------------+-----------------------------------------------------+----------------+ ``` [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: reidliu Co-authored-by: reidliu --- llama_stack/cli/model/list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/cli/model/list.py b/llama_stack/cli/model/list.py index 6d296e75e1..9b5ebb1a5f 100644 --- a/llama_stack/cli/model/list.py +++ b/llama_stack/cli/model/list.py @@ -38,7 +38,7 @@ def _run_model_list_cmd(self, args: argparse.Namespace) -> None: headers = [ "Model Descriptor", - "Hugging Face Repo", + "Model ID", "Context Length", ] From b8a612e5bd83e9e367cef7e5b1f95849d0984f3e Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Feb 2025 09:50:38 -0800 Subject: [PATCH 23/31] update --- docs/openapi_generator/pyopenapi/generator.py | 2 +- .../openapi_generator/pyopenapi/operations.py | 2 ++ llama_stack/apis/agents/agents.py | 19 +++++++++++-------- llama_stack/apis/datasets/datasets.py | 4 ++-- llama_stack/apis/models/models.py | 4 ++-- .../scoring_functions/scoring_functions.py | 2 +- llama_stack/apis/shields/shields.py | 2 +- llama_stack/apis/telemetry/telemetry.py | 8 ++++---- llama_stack/apis/tools/tools.py | 6 +++--- llama_stack/apis/vector_dbs/vector_dbs.py | 4 ++-- 10 files changed, 29 insertions(+), 24 deletions(-) diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index 86db8a06d9..e37c45690c 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -647,7 +647,6 @@ def _build_operation(self, op: EndpointOperation) -> Operation: description = "\n".join( filter(None, [doc_string.short_description, doc_string.long_description]) ) - return Operation( tags=[op.defining_class.__name__], summary=None, @@ -685,6 +684,7 @@ def generate(self) -> Document: raise NotImplementedError(f"unknown HTTP method: {op.http_method}") route = op.get_route() + route = route.replace(":path", "") print(f"route: {route}") if route in paths: paths[route].update(pathItem) diff --git a/docs/openapi_generator/pyopenapi/operations.py b/docs/openapi_generator/pyopenapi/operations.py index abeb169366..bf4d35c87f 100644 --- a/docs/openapi_generator/pyopenapi/operations.py +++ 
b/docs/openapi_generator/pyopenapi/operations.py @@ -130,6 +130,8 @@ def __getitem__(self, key: str) -> None: def _get_route_parameters(route: str) -> List[str]: extractor = _FormatParameterExtractor() + # Replace all occurrences of ":path" with empty string + route = route.replace(":path", "") route.format_map(extractor) return extractor.keys diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index 785248633f..b20145be96 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -29,11 +29,11 @@ SamplingParams, ToolCall, ToolChoice, + ToolConfig, ToolPromptFormat, ToolResponse, ToolResponseMessage, UserMessage, - ToolConfig, ) from llama_stack.apis.safety import SafetyViolation from llama_stack.apis.tools import ToolDef @@ -318,7 +318,7 @@ async def create_agent( agent_config: AgentConfig, ) -> AgentCreateResponse: ... - @webmethod(route="/agents/{agent_id}/session/{session_id}/turn", method="POST") + @webmethod(route="/agents/{agent_id:path}/session/{session_id:path}/turn", method="POST") async def create_agent_turn( self, agent_id: str, @@ -335,7 +335,10 @@ async def create_agent_turn( tool_config: Optional[ToolConfig] = None, ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: ... - @webmethod(route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}", method="GET") + @webmethod( + route="/agents/{agent_id:path}/session/{session_id:path}/turn/{turn_id:path}", + method="GET", + ) async def get_agents_turn( self, agent_id: str, @@ -344,7 +347,7 @@ async def get_agents_turn( ) -> Turn: ... @webmethod( - route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}", + route="/agents/{agent_id:path}/session/{session_id:path}/turn/{turn_id:path}/step/{step_id:path}", method="GET", ) async def get_agents_step( @@ -355,14 +358,14 @@ async def get_agents_step( step_id: str, ) -> AgentStepResponse: ... - @webmethod(route="/agents/{agent_id}/session", method="POST") + @webmethod(route="/agents/{agent_id:path}/session", method="POST") async def create_agent_session( self, agent_id: str, session_name: str, ) -> AgentSessionCreateResponse: ... - @webmethod(route="/agents/{agent_id}/session/{session_id}", method="GET") + @webmethod(route="/agents/{agent_id:path}/session/{session_id:path}", method="GET") async def get_agents_session( self, session_id: str, @@ -370,14 +373,14 @@ async def get_agents_session( turn_ids: Optional[List[str]] = None, ) -> Session: ... - @webmethod(route="/agents/{agent_id}/session/{session_id}", method="DELETE") + @webmethod(route="/agents/{agent_id:path}/session/{session_id:path}", method="DELETE") async def delete_agents_session( self, session_id: str, agent_id: str, ) -> None: ... - @webmethod(route="/agents/{agent_id}", method="DELETE") + @webmethod(route="/agents/{agent_id:path}", method="DELETE") async def delete_agent( self, agent_id: str, diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 5ad5bdcdb9..5e2b38697d 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -58,7 +58,7 @@ async def register_dataset( metadata: Optional[Dict[str, Any]] = None, ) -> None: ... - @webmethod(route="/datasets/{dataset_id}", method="GET") + @webmethod(route="/datasets/{dataset_id:path}", method="GET") async def get_dataset( self, dataset_id: str, @@ -67,7 +67,7 @@ async def get_dataset( @webmethod(route="/datasets", method="GET") async def list_datasets(self) -> ListDatasetsResponse: ... 
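
A minimal, self-contained sketch of the `format_map` trick behind `_get_route_parameters` above (simplified: the real `_FormatParameterExtractor.__getitem__` returns `None`, and the names here are illustrative):

```
from typing import List


class FormatParameterExtractor:
    "Records every {name} placeholder that str.format_map() looks up."

    def __init__(self) -> None:
        self.keys: List[str] = []

    def __getitem__(self, key: str) -> str:
        self.keys.append(key)  # only the name matters; the value is discarded
        return ""


def get_route_parameters(route: str) -> List[str]:
    # Strip the Starlette-style converter first: "{agent_id:path}" would
    # otherwise make format_map() fail, since ":path" is not a valid
    # string format specifier.
    route = route.replace(":path", "")
    extractor = FormatParameterExtractor()
    route.format_map(extractor)
    return extractor.keys


assert get_route_parameters(
    "/agents/{agent_id:path}/session/{session_id:path}"
) == ["agent_id", "session_id"]
```
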
- @webmethod(route="/datasets/{dataset_id}", method="DELETE") + @webmethod(route="/datasets/{dataset_id:path}", method="DELETE") async def unregister_dataset( self, dataset_id: str, diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py index 3361c2836e..7e6d9854fa 100644 --- a/llama_stack/apis/models/models.py +++ b/llama_stack/apis/models/models.py @@ -62,7 +62,7 @@ class Models(Protocol): @webmethod(route="/models", method="GET") async def list_models(self) -> ListModelsResponse: ... - @webmethod(route="/models/{model_id}", method="GET") + @webmethod(route="/models/{model_id:path}", method="GET") async def get_model( self, model_id: str, @@ -78,7 +78,7 @@ async def register_model( model_type: Optional[ModelType] = None, ) -> Model: ... - @webmethod(route="/models/{model_id}", method="DELETE") + @webmethod(route="/models/{model_id:path}", method="DELETE") async def unregister_model( self, model_id: str, diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index 3259795832..3fa40ffbfe 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -134,7 +134,7 @@ class ScoringFunctions(Protocol): @webmethod(route="/scoring-functions", method="GET") async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ... - @webmethod(route="/scoring-functions/{scoring_fn_id}", method="GET") + @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET") async def get_scoring_function(self, scoring_fn_id: str, /) -> Optional[ScoringFn]: ... @webmethod(route="/scoring-functions", method="POST") diff --git a/llama_stack/apis/shields/shields.py b/llama_stack/apis/shields/shields.py index 3dd685b145..ae316ee536 100644 --- a/llama_stack/apis/shields/shields.py +++ b/llama_stack/apis/shields/shields.py @@ -48,7 +48,7 @@ class Shields(Protocol): @webmethod(route="/shields", method="GET") async def list_shields(self) -> ListShieldsResponse: ... - @webmethod(route="/shields/{identifier}", method="GET") + @webmethod(route="/shields/{identifier:path}", method="GET") async def get_shield(self, identifier: str) -> Optional[Shield]: ... @webmethod(route="/shields", method="POST") diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index 6272cc40b7..5622aaeac8 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -13,8 +13,8 @@ Literal, Optional, Protocol, - Union, runtime_checkable, + Union, ) from llama_models.llama3.api.datatypes import Primitive @@ -224,13 +224,13 @@ async def query_traces( order_by: Optional[List[str]] = None, ) -> QueryTracesResponse: ... - @webmethod(route="/telemetry/traces/{trace_id}", method="GET") + @webmethod(route="/telemetry/traces/{trace_id:path}", method="GET") async def get_trace(self, trace_id: str) -> Trace: ... - @webmethod(route="/telemetry/traces/{trace_id}/spans/{span_id}", method="GET") + @webmethod(route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET") async def get_span(self, trace_id: str, span_id: str) -> Span: ... 
- @webmethod(route="/telemetry/spans/{span_id}/tree", method="GET") + @webmethod(route="/telemetry/spans/{span_id:path}/tree", method="GET") async def get_span_tree( self, span_id: str, diff --git a/llama_stack/apis/tools/tools.py b/llama_stack/apis/tools/tools.py index d6d806c531..a8e946b082 100644 --- a/llama_stack/apis/tools/tools.py +++ b/llama_stack/apis/tools/tools.py @@ -101,7 +101,7 @@ async def register_tool_group( """Register a tool group""" ... - @webmethod(route="/toolgroups/{toolgroup_id}", method="GET") + @webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET") async def get_tool_group( self, toolgroup_id: str, @@ -117,13 +117,13 @@ async def list_tools(self, toolgroup_id: Optional[str] = None) -> ListToolsRespo """List tools with optional tool group""" ... - @webmethod(route="/tools/{tool_name}", method="GET") + @webmethod(route="/tools/{tool_name:path}", method="GET") async def get_tool( self, tool_name: str, ) -> Tool: ... - @webmethod(route="/toolgroups/{toolgroup_id}", method="DELETE") + @webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE") async def unregister_toolgroup( self, toolgroup_id: str, diff --git a/llama_stack/apis/vector_dbs/vector_dbs.py b/llama_stack/apis/vector_dbs/vector_dbs.py index 4b782e2d5b..1da2c128c2 100644 --- a/llama_stack/apis/vector_dbs/vector_dbs.py +++ b/llama_stack/apis/vector_dbs/vector_dbs.py @@ -46,7 +46,7 @@ class VectorDBs(Protocol): @webmethod(route="/vector-dbs", method="GET") async def list_vector_dbs(self) -> ListVectorDBsResponse: ... - @webmethod(route="/vector-dbs/{vector_db_id}", method="GET") + @webmethod(route="/vector-dbs/{vector_db_id:path}", method="GET") async def get_vector_db( self, vector_db_id: str, @@ -62,5 +62,5 @@ async def register_vector_db( provider_vector_db_id: Optional[str] = None, ) -> VectorDB: ... - @webmethod(route="/vector-dbs/{vector_db_id}", method="DELETE") + @webmethod(route="/vector-dbs/{vector_db_id:path}", method="DELETE") async def unregister_vector_db(self, vector_db_id: str) -> None: ... From 0e426d3cf8c7bc1f14dcfaf98b5212a80fdc0b1c Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Thu, 13 Feb 2025 12:14:57 -0500 Subject: [PATCH 24/31] chore: Link to Groq docs in the warning message for preview model (#1060) This should be `llama-3.2-3b` instead of `llama-3.2-3b-instruct`. --- llama_stack/providers/remote/inference/groq/groq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py index 4e6cc2d6bd..9b3c1abbf2 100644 --- a/llama_stack/providers/remote/inference/groq/groq.py +++ b/llama_stack/providers/remote/inference/groq/groq.py @@ -108,6 +108,7 @@ async def chat_completion( "Groq only contains a preview version for llama-3.2-3b-instruct. " "Preview models aren't recommended for production use. " "They can be discontinued on short notice." 
+ "More details: https://console.groq.com/docs/models" ) request = convert_chat_completion_request( From ceff63130d2c05874b5f8b9f0a5b67cad0cc164a Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 11:21:51 -0800 Subject: [PATCH 25/31] deprecation in OpenAPI spec --- docs/_static/llama-stack-spec.html | 96 +++++++++++++++++++ docs/_static/llama-stack-spec.yaml | 64 +++++++++++++ docs/openapi_generator/pyopenapi/generator.py | 1 + llama_stack/apis/eval_tasks/eval_tasks.py | 71 ++++++++++++++ 4 files changed, 232 insertions(+) create mode 100644 llama_stack/apis/eval_tasks/eval_tasks.py diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index b93f6a380a..2c5827d37f 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,6 +40,7 @@ } ], "paths": { +<<<<<<< HEAD "/v1/eval/tasks/{task_id}/evaluations": { "post": { "responses": { @@ -234,6 +235,8 @@ "deprecated": true } }, +======= +>>>>>>> 974941be (deprecation in OpenAPI spec) "/v1/eval-tasks": { "get": { "responses": { @@ -242,18 +245,27 @@ "content": { "application/json": { "schema": { +<<<<<<< HEAD "$ref": "#/components/schemas/ListBenchmarksResponse" +======= + "$ref": "#/components/schemas/ListEvalTasksResponse" +>>>>>>> 974941be (deprecation in OpenAPI spec) } } } } }, "tags": [ +<<<<<<< HEAD "Benchmarks" +======= + "EvalTasks" +>>>>>>> 974941be (deprecation in OpenAPI spec) ], "description": "", "parameters": [], "deprecated": true +<<<<<<< HEAD }, "post": { "responses": { @@ -318,6 +330,8 @@ "required": true }, "deprecated": true +======= +>>>>>>> 974941be (deprecation in OpenAPI spec) } }, "/v1/datasetio/rows": { @@ -2645,7 +2659,89 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { +<<<<<<< HEAD "AgentCandidate": { +======= + "EvalTask": { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "provider_resource_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "eval_task", + "default": "eval_task" + }, + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "identifier", + "provider_resource_id", + "provider_id", + "type", + "dataset_id", + "scoring_functions", + "metadata" + ] + }, + "ListEvalTasksResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EvalTask" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ] + }, + "AppendRowsRequest": { +>>>>>>> 974941be (deprecation in OpenAPI spec) "type": "object", "properties": { "type": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index b30025020b..c743ce47aa 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,6 +10,7 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: +<<<<<<< HEAD /v1/eval/tasks/{task_id}/evaluations: post: responses: @@ -125,6 +126,8 @@ paths: schema: type: string deprecated: true +======= +>>>>>>> 974941be (deprecation in OpenAPI spec) /v1/eval-tasks: get: responses: @@ -133,6 
+136,7 @@ paths: content: application/json: schema: +<<<<<<< HEAD $ref: '#/components/schemas/ListBenchmarksResponse' tags: - Benchmarks @@ -179,6 +183,14 @@ paths: $ref: '#/components/schemas/DeprecatedRunEvalRequest' required: true deprecated: true +======= + $ref: '#/components/schemas/ListEvalTasksResponse' + tags: + - EvalTasks + description: '' + parameters: [] + deprecated: true +>>>>>>> 974941be (deprecation in OpenAPI spec) /v1/datasetio/rows: get: responses: @@ -1598,7 +1610,59 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: +<<<<<<< HEAD AgentCandidate: +======= + EvalTask: + type: object + properties: + identifier: + type: string + provider_resource_id: + type: string + provider_id: + type: string + type: + type: string + const: eval_task + default: eval_task + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - identifier + - provider_resource_id + - provider_id + - type + - dataset_id + - scoring_functions + - metadata + ListEvalTasksResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/EvalTask' + additionalProperties: false + required: + - data + AppendRowsRequest: +>>>>>>> 974941be (deprecation in OpenAPI spec) type: object properties: type: diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index e37c45690c..0f3b997848 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -647,6 +647,7 @@ def _build_operation(self, op: EndpointOperation) -> Operation: description = "\n".join( filter(None, [doc_string.short_description, doc_string.long_description]) ) + return Operation( tags=[op.defining_class.__name__], summary=None, diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py new file mode 100644 index 0000000000..9a26fd0c0d --- /dev/null +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable + +from llama_models.schema_utils import json_schema_type, webmethod +from pydantic import BaseModel, Field + +from llama_stack.apis.resource import Resource, ResourceType + + +class CommonEvalTaskFields(BaseModel): + dataset_id: str + scoring_functions: List[str] + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="Metadata for this evaluation task", + ) + + +@json_schema_type +class EvalTask(CommonEvalTaskFields, Resource): + type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value + + @property + def task_id(self) -> str: + return self.identifier + + @property + def provider_eval_task_id(self) -> str: + return self.provider_resource_id + + +class EvalTaskInput(CommonEvalTaskFields, BaseModel): + task_id: str + provider_id: Optional[str] = None + provider_eval_task_id: Optional[str] = None + + +class ListEvalTasksResponse(BaseModel): + data: List[EvalTask] + + +@runtime_checkable +class EvalTasks(Protocol): + @webmethod(route="/eval-tasks", method="GET") + async def DEPRECATED_list_eval_tasks_deprecated( + self, + ) -> ListEvalTasksResponse: ... + + @webmethod(route="/eval/tasks", method="GET") + async def list_eval_tasks(self) -> ListEvalTasksResponse: ... + + @webmethod(route="/eval/tasks/{task_id}", method="GET") + async def get_eval_task( + self, + task_id: str, + ) -> Optional[EvalTask]: ... + + @webmethod(route="/eval/tasks", method="POST") + async def register_eval_task( + self, + task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_eval_task_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... From 9ce00ede9bb2f1db7a91d6d4379886294cad287a Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Feb 2025 09:48:52 -0800 Subject: [PATCH 26/31] update --- docs/_static/llama-stack-spec.html | 55 +++---------------- docs/_static/llama-stack-spec.yaml | 13 +---- .../Llama_Stack_Benchmark_Evals.ipynb | 2 +- .../distribution/routers/routing_tables.py | 6 ++ 4 files changed, 18 insertions(+), 58 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 2c5827d37f..6cd8b47581 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2665,15 +2665,6 @@ "EvalTask": { "type": "object", "properties": { - "identifier": { - "type": "string" - }, - "provider_resource_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, "type": { "type": "string", "const": "eval_task", @@ -2682,53 +2673,23 @@ "dataset_id": { "type": "string" }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "config": { + "$ref": "#/components/schemas/AgentConfig" } }, "additionalProperties": false, "required": [ - "identifier", - "provider_resource_id", - "provider_id", "type", - "dataset_id", - "scoring_functions", - "metadata" + "config" ] }, "ListEvalTasksResponse": { "type": "object", "properties": { - "data": { + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "input_shields": { "type": "array", "items": { "$ref": "#/components/schemas/EvalTask" @@ -2768,7 +2729,7 @@ "input_shields": { "type": "array", 
"items": { - "type": "string" + "$ref": "#/components/schemas/ToolDef" } }, "output_shields": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index c743ce47aa..19980de99a 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1616,12 +1616,6 @@ components: EvalTask: type: object properties: - identifier: - type: string - provider_resource_id: - type: string - provider_id: - type: string type: type: string const: eval_task @@ -1644,9 +1638,6 @@ components: - type: object additionalProperties: false required: - - identifier - - provider_resource_id - - provider_id - type - dataset_id - scoring_functions @@ -1654,7 +1645,9 @@ components: ListEvalTasksResponse: type: object properties: - data: + sampling_params: + $ref: '#/components/schemas/SamplingParams' + input_shields: type: array items: $ref: '#/components/schemas/EvalTask' diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 8eecf84abb..2861c8499d 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1214,7 +1214,7 @@ " \"sampling_params\": {\n", " \"strategy\": {\n", " \"type\": \"greedy\",\n", - " },\n", + " },b\n", " \"max_tokens\": 4096,\n", " \"repeat_penalty\": 1.0,\n", " },\n", diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index ec258af491..563c5c5ab1 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -472,16 +472,20 @@ async def register_benchmark( async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: logger.warning("DEPRECATED: Use /eval/benchmarks instead") return await self.list_benchmarks() + return await self.list_benchmarks() async def DEPRECATED_get_eval_task( self, + task_id: str, eval_task_id: str, ) -> Optional[Benchmark]: logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.get_benchmark(task_id) return await self.get_benchmark(eval_task_id) async def DEPRECATED_register_eval_task( self, + task_id: str, eval_task_id: str, dataset_id: str, scoring_functions: List[str], @@ -490,6 +494,8 @@ async def DEPRECATED_register_eval_task( metadata: Optional[Dict[str, Any]] = None, ) -> None: logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.register_benchmark( + benchmark_id=task_id, return await self.register_benchmark( benchmark_id=eval_task_id, dataset_id=dataset_id, From 39980dc83f9c43e349156ad33acffc758f51db4d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:39:52 -0800 Subject: [PATCH 27/31] openapi --- docs/_static/llama-stack-spec.yaml | 51 +++++++++++++++++++----------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 19980de99a..c36c6e2571 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -2922,34 +2922,47 @@ components: TextDelta: type: object properties: - type: + name: type: string - const: text - default: text - text: + description: type: string + parameters: + type: array + items: + $ref: '#/components/schemas/ToolParameter' + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object additionalProperties: false required: - - type - - text - 
ToolCallDelta: + - name + ToolParameter: type: object properties: - type: + name: type: string - const: tool_call - default: tool_call - tool_call: + parameter_type: + type: string + description: + type: string + required: + type: boolean + default: true + default: oneOf: + - type: 'null' + - type: boolean + - type: number - type: string - - $ref: '#/components/schemas/ToolCall' - parse_status: - type: string - enum: - - started - - in_progress - - failed - - succeeded + - type: array + - type: object additionalProperties: false required: - type From 139d5bded62de6c6df1cbc796ba22bb156d050c4 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Feb 2025 09:53:03 -0800 Subject: [PATCH 28/31] update --- docs/_static/llama-stack-spec.html | 59 +--------------- docs/_static/llama-stack-spec.yaml | 108 +++++------------------------ 2 files changed, 20 insertions(+), 147 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 6cd8b47581..b93f6a380a 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,7 +40,6 @@ } ], "paths": { -<<<<<<< HEAD "/v1/eval/tasks/{task_id}/evaluations": { "post": { "responses": { @@ -235,8 +234,6 @@ "deprecated": true } }, -======= ->>>>>>> 974941be (deprecation in OpenAPI spec) "/v1/eval-tasks": { "get": { "responses": { @@ -245,27 +242,18 @@ "content": { "application/json": { "schema": { -<<<<<<< HEAD "$ref": "#/components/schemas/ListBenchmarksResponse" -======= - "$ref": "#/components/schemas/ListEvalTasksResponse" ->>>>>>> 974941be (deprecation in OpenAPI spec) } } } } }, "tags": [ -<<<<<<< HEAD "Benchmarks" -======= - "EvalTasks" ->>>>>>> 974941be (deprecation in OpenAPI spec) ], "description": "", "parameters": [], "deprecated": true -<<<<<<< HEAD }, "post": { "responses": { @@ -330,8 +318,6 @@ "required": true }, "deprecated": true -======= ->>>>>>> 974941be (deprecation in OpenAPI spec) } }, "/v1/datasetio/rows": { @@ -2659,50 +2645,7 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { -<<<<<<< HEAD "AgentCandidate": { -======= - "EvalTask": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "eval_task", - "default": "eval_task" - }, - "dataset_id": { - "type": "string" - }, - "config": { - "$ref": "#/components/schemas/AgentConfig" - } - }, - "additionalProperties": false, - "required": [ - "type", - "config" - ] - }, - "ListEvalTasksResponse": { - "type": "object", - "properties": { - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "input_shields": { - "type": "array", - "items": { - "$ref": "#/components/schemas/EvalTask" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ] - }, - "AppendRowsRequest": { ->>>>>>> 974941be (deprecation in OpenAPI spec) "type": "object", "properties": { "type": { @@ -2729,7 +2672,7 @@ "input_shields": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDef" + "type": "string" } }, "output_shields": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index c36c6e2571..b30025020b 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,7 +10,6 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: -<<<<<<< HEAD /v1/eval/tasks/{task_id}/evaluations: post: responses: @@ -126,8 +125,6 @@ paths: schema: type: string deprecated: true -======= ->>>>>>> 974941be (deprecation in OpenAPI spec) /v1/eval-tasks: get: responses: 
@@ -136,7 +133,6 @@ paths: content: application/json: schema: -<<<<<<< HEAD $ref: '#/components/schemas/ListBenchmarksResponse' tags: - Benchmarks @@ -183,14 +179,6 @@ paths: $ref: '#/components/schemas/DeprecatedRunEvalRequest' required: true deprecated: true -======= - $ref: '#/components/schemas/ListEvalTasksResponse' - tags: - - EvalTasks - description: '' - parameters: [] - deprecated: true ->>>>>>> 974941be (deprecation in OpenAPI spec) /v1/datasetio/rows: get: responses: @@ -1610,52 +1598,7 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: -<<<<<<< HEAD AgentCandidate: -======= - EvalTask: - type: object - properties: - type: - type: string - const: eval_task - default: eval_task - dataset_id: - type: string - scoring_functions: - type: array - items: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - type - - dataset_id - - scoring_functions - - metadata - ListEvalTasksResponse: - type: object - properties: - sampling_params: - $ref: '#/components/schemas/SamplingParams' - input_shields: - type: array - items: - $ref: '#/components/schemas/EvalTask' - additionalProperties: false - required: - - data - AppendRowsRequest: ->>>>>>> 974941be (deprecation in OpenAPI spec) type: object properties: type: @@ -2922,47 +2865,34 @@ components: TextDelta: type: object properties: - name: + type: type: string - description: + const: text + default: text + text: type: string - parameters: - type: array - items: - $ref: '#/components/schemas/ToolParameter' - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object additionalProperties: false required: - - name - ToolParameter: + - type + - text + ToolCallDelta: type: object properties: - name: - type: string - parameter_type: - type: string - description: + type: type: string - required: - type: boolean - default: true - default: + const: tool_call + default: tool_call + tool_call: oneOf: - - type: 'null' - - type: boolean - - type: number - type: string - - type: array - - type: object + - $ref: '#/components/schemas/ToolCall' + parse_status: + type: string + enum: + - started + - in_progress + - failed + - succeeded additionalProperties: false required: - type From e183ec988f93bb2d89dc5526ea7937129ddd9f17 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Feb 2025 09:58:10 -0800 Subject: [PATCH 29/31] update --- docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb | 2 +- llama_stack/distribution/routers/routing_tables.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 2861c8499d..8eecf84abb 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1214,7 +1214,7 @@ " \"sampling_params\": {\n", " \"strategy\": {\n", " \"type\": \"greedy\",\n", - " },b\n", + " },\n", " \"max_tokens\": 4096,\n", " \"repeat_penalty\": 1.0,\n", " },\n", diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 563c5c5ab1..ec258af491 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -472,20 +472,16 @@ async def register_benchmark( 
     async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
         logger.warning("DEPRECATED: Use /eval/benchmarks instead")
         return await self.list_benchmarks()
-        return await self.list_benchmarks()
 
     async def DEPRECATED_get_eval_task(
         self,
-        task_id: str,
         eval_task_id: str,
     ) -> Optional[Benchmark]:
         logger.warning("DEPRECATED: Use /eval/benchmarks instead")
-        return await self.get_benchmark(task_id)
         return await self.get_benchmark(eval_task_id)
 
     async def DEPRECATED_register_eval_task(
         self,
-        task_id: str,
         eval_task_id: str,
         dataset_id: str,
         scoring_functions: List[str],
@@ -494,8 +490,6 @@ async def DEPRECATED_register_eval_task(
         metadata: Optional[Dict[str, Any]] = None,
     ) -> None:
         logger.warning("DEPRECATED: Use /eval/benchmarks instead")
-        return await self.register_benchmark(
-            benchmark_id=task_id,
         return await self.register_benchmark(
             benchmark_id=eval_task_id,
             dataset_id=dataset_id,

From c56db9e7b258f3cfd21fa6156ea8c8ef3f68dfb5 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Thu, 13 Feb 2025 10:11:18 -0800
Subject: [PATCH 30/31] completely remove eval_task

---
 llama_stack/apis/eval_tasks/eval_tasks.py | 71 -----------------------
 1 file changed, 71 deletions(-)
 delete mode 100644 llama_stack/apis/eval_tasks/eval_tasks.py

diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py
deleted file mode 100644
index 9a26fd0c0d..0000000000
--- a/llama_stack/apis/eval_tasks/eval_tasks.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
-
-from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-
-
-class CommonEvalTaskFields(BaseModel):
-    dataset_id: str
-    scoring_functions: List[str]
-    metadata: Dict[str, Any] = Field(
-        default_factory=dict,
-        description="Metadata for this evaluation task",
-    )
-
-
-@json_schema_type
-class EvalTask(CommonEvalTaskFields, Resource):
-    type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value
-
-    @property
-    def task_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_eval_task_id(self) -> str:
-        return self.provider_resource_id
-
-
-class EvalTaskInput(CommonEvalTaskFields, BaseModel):
-    task_id: str
-    provider_id: Optional[str] = None
-    provider_eval_task_id: Optional[str] = None
-
-
-class ListEvalTasksResponse(BaseModel):
-    data: List[EvalTask]
-
-
-@runtime_checkable
-class EvalTasks(Protocol):
-    @webmethod(route="/eval-tasks", method="GET")
-    async def DEPRECATED_list_eval_tasks_deprecated(
-        self,
-    ) -> ListEvalTasksResponse: ...
-
-    @webmethod(route="/eval/tasks", method="GET")
-    async def list_eval_tasks(self) -> ListEvalTasksResponse: ...
-
-    @webmethod(route="/eval/tasks/{task_id}", method="GET")
-    async def get_eval_task(
-        self,
-        task_id: str,
-    ) -> Optional[EvalTask]: ...
-
-    @webmethod(route="/eval/tasks", method="POST")
-    async def register_eval_task(
-        self,
-        task_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_eval_task_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
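
The shim pattern used by the `DEPRECATED_*` methods above is small enough to show in isolation; this is a sketch, with the class pared down to the delegation logic (the real routing table resolves objects from a registry):

```
import asyncio
import logging
from typing import Optional

logger = logging.getLogger(__name__)


class BenchmarksRoutingTable:
    async def get_benchmark(self, benchmark_id: str) -> Optional[dict]:
        # Stand-in for the real registry lookup.
        return {"identifier": benchmark_id, "type": "benchmark"}

    async def DEPRECATED_get_eval_task(self, eval_task_id: str) -> Optional[dict]:
        # Old route name kept for backwards compatibility: warn, then delegate.
        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
        return await self.get_benchmark(eval_task_id)


print(asyncio.run(BenchmarksRoutingTable().DEPRECATED_get_eval_task("mmlu")))
```
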
From b0ad0c109014cf86be258651fd8ecc8ebc210b90 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Feb 2025 10:13:10 -0800 Subject: [PATCH 31/31] precommit --- llama_stack/apis/telemetry/telemetry.py | 2 +- llama_stack/distribution/datatypes.py | 1 - llama_stack/distribution/routers/routing_tables.py | 1 - llama_stack/providers/datatypes.py | 1 - llama_stack/providers/inline/eval/meta_reference/eval.py | 1 - llama_stack/providers/tests/resolver.py | 1 - 6 files changed, 1 insertion(+), 6 deletions(-) diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index 5622aaeac8..63ae1dc738 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -13,8 +13,8 @@ Literal, Optional, Protocol, - runtime_checkable, Union, + runtime_checkable, ) from llama_models.llama3.api.datatypes import Primitive diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 75ab73b9ba..f62996081b 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -9,7 +9,6 @@ from pydantic import BaseModel, Field from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput - from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset, DatasetInput from llama_stack.apis.eval import Eval diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index ec258af491..2cddc3970d 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -10,7 +10,6 @@ from pydantic import TypeAdapter from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse - from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ParamType from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index 5d56505af1..b92f9dc0a0 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -11,7 +11,6 @@ from pydantic import BaseModel, Field from llama_stack.apis.benchmarks import Benchmark - from llama_stack.apis.datasets import Dataset from llama_stack.apis.datatypes import Api from llama_stack.apis.models import Model diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 9827ff2081..cd99c9ad89 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -15,7 +15,6 @@ from llama_stack.apis.scoring import Scoring from llama_stack.distribution.datatypes import Api from llama_stack.providers.datatypes import BenchmarksProtocolPrivate - from llama_stack.providers.inline.agents.meta_reference.agent_instance import ( MEMORY_QUERY_TOOL, ) diff --git a/llama_stack/providers/tests/resolver.py b/llama_stack/providers/tests/resolver.py index 092514079a..76343b7f48 100644 --- a/llama_stack/providers/tests/resolver.py +++ b/llama_stack/providers/tests/resolver.py @@ -11,7 +11,6 @@ from pydantic import BaseModel from llama_stack.apis.benchmarks import BenchmarkInput - from llama_stack.apis.datasets import DatasetInput from llama_stack.apis.models import ModelInput from llama_stack.apis.scoring_functions import ScoringFnInput
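
A footnote on the import reshuffle in `telemetry.py` above: the sort applied by the pre-commit formatter is case-sensitive, and uppercase letters precede lowercase letters in ASCII, which is why `Union` now lands ahead of `runtime_checkable`. A quick check of that ordering (assuming plain `sorted()` semantics, which case-sensitive import sorters follow):

```
names = ["runtime_checkable", "Union", "Protocol", "Optional", "Literal"]
assert sorted(names) == ["Literal", "Optional", "Protocol", "Union", "runtime_checkable"]
assert ord("U") < ord("r")  # 85 < 114, so uppercase sorts first
```
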