From f1844a88c41a08d085c7b5165b714096ad0f5086 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 09:37:21 -0800 Subject: [PATCH 01/31] update eval-tasks -> eval/task --- docs/_static/llama-stack-spec.html | 52 ++++++++++++----------- docs/_static/llama-stack-spec.yaml | 35 +++++++-------- llama_stack/apis/eval_tasks/eval_tasks.py | 6 +-- 3 files changed, 48 insertions(+), 45 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 151ac14516..84e20f3602 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -728,7 +728,7 @@ ] } }, - "/v1/eval-tasks/{eval_task_id}": { + "/v1/eval/tasks/{task_id}": { "get": { "responses": { "200": { @@ -756,7 +756,7 @@ "parameters": [ { "name": "eval_task_id", - "in": "path", + "in": "query", "required": true, "schema": { "type": "string" @@ -1503,7 +1503,7 @@ } } }, - "/v1/eval-tasks": { + "/v1/eval/tasks/": { "get": { "responses": { "200": { @@ -1522,28 +1522,6 @@ ], "description": "", "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "EvalTasks" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterEvalTaskRequest" - } - } - }, - "required": true - } } }, "/v1/models": { @@ -2121,6 +2099,30 @@ ] } }, + "/v1/eval/tasks": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "EvalTasks" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterEvalTaskRequest" + } + } + }, + "required": true + } + } + }, "/v1/eval/tasks/{task_id}/jobs": { "post": { "responses": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 37fba45412..700a3071c8 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -440,7 +440,7 @@ paths: required: true schema: type: string - /v1/eval-tasks/{eval_task_id}: + /v1/eval/tasks/{task_id}: get: responses: '200': @@ -456,7 +456,7 @@ paths: description: '' parameters: - name: eval_task_id - in: path + in: query required: true schema: type: string @@ -895,7 +895,7 @@ paths: schema: $ref: '#/components/schemas/RegisterDatasetRequest' required: true - /v1/eval-tasks: + /v1/eval/tasks/: get: responses: '200': @@ -908,20 +908,6 @@ paths: - EvalTasks description: '' parameters: [] - post: - responses: - '200': - description: OK - tags: - - EvalTasks - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterEvalTaskRequest' - required: true /v1/models: get: responses: @@ -1278,6 +1264,21 @@ paths: type: array items: type: string + /v1/eval/tasks: + post: + responses: + '200': + description: OK + tags: + - EvalTasks + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterEvalTaskRequest' + required: true /v1/eval/tasks/{task_id}/jobs: post: responses: diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index a0a5330553..3600589d13 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -45,16 +45,16 @@ class ListEvalTasksResponse(BaseModel): @runtime_checkable class EvalTasks(Protocol): - @webmethod(route="/eval-tasks", method="GET") + 
@webmethod(route="/eval/tasks/", method="GET") async def list_eval_tasks(self) -> ListEvalTasksResponse: ... - @webmethod(route="/eval-tasks/{eval_task_id}", method="GET") + @webmethod(route="/eval/tasks/{task_id}", method="GET") async def get_eval_task( self, eval_task_id: str, ) -> Optional[EvalTask]: ... - @webmethod(route="/eval-tasks", method="POST") + @webmethod(route="/eval/tasks", method="POST") async def register_eval_task( self, eval_task_id: str, From 5fe3ddb27d706f2141959f15ccc40cac975cf578 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 09:41:01 -0800 Subject: [PATCH 02/31] update eval_task_id -> task_id --- llama_stack/apis/eval_tasks/eval_tasks.py | 4 ++-- llama_stack/distribution/routers/routing_tables.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index 3600589d13..f36d44c887 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -51,13 +51,13 @@ async def list_eval_tasks(self) -> ListEvalTasksResponse: ... @webmethod(route="/eval/tasks/{task_id}", method="GET") async def get_eval_task( self, - eval_task_id: str, + task_id: str, ) -> Optional[EvalTask]: ... @webmethod(route="/eval/tasks", method="POST") async def register_eval_task( self, - eval_task_id: str, + task_id: str, dataset_id: str, scoring_functions: List[str], provider_eval_task_id: Optional[str] = None, diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 68fafd8ee9..3f5dea66d1 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -432,8 +432,8 @@ class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks): async def list_eval_tasks(self) -> ListEvalTasksResponse: return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task")) - async def get_eval_task(self, eval_task_id: str) -> Optional[EvalTask]: - return await self.get_object_by_identifier("eval_task", eval_task_id) + async def get_eval_task(self, task_id: str) -> Optional[EvalTask]: + return await self.get_object_by_identifier("eval_task", task_id) async def register_eval_task( self, From b11c38ea552c0e5cc0ebefda47e6c775f4556372 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 09:41:21 -0800 Subject: [PATCH 03/31] openapi --- docs/_static/llama-stack-spec.html | 8 ++++---- docs/_static/llama-stack-spec.yaml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 84e20f3602..3106bff86b 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -755,8 +755,8 @@ "description": "", "parameters": [ { - "name": "eval_task_id", - "in": "query", + "name": "task_id", + "in": "path", "required": true, "schema": { "type": "string" @@ -7226,7 +7226,7 @@ "RegisterEvalTaskRequest": { "type": "object", "properties": { - "eval_task_id": { + "task_id": { "type": "string" }, "dataset_id": { @@ -7272,7 +7272,7 @@ }, "additionalProperties": false, "required": [ - "eval_task_id", + "task_id", "dataset_id", "scoring_functions" ] diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 700a3071c8..9b4220018d 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -455,8 +455,8 @@ paths: - EvalTasks description: '' parameters: - - name: 
eval_task_id - in: query + - name: task_id + in: path required: true schema: type: string @@ -4599,7 +4599,7 @@ components: RegisterEvalTaskRequest: type: object properties: - eval_task_id: + task_id: type: string dataset_id: type: string @@ -4623,7 +4623,7 @@ components: - type: object additionalProperties: false required: - - eval_task_id + - task_id - dataset_id - scoring_functions RegisterModelRequest: From e013b9066c2efaefee8c491d441b466a8b379777 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 10:47:28 -0800 Subject: [PATCH 04/31] fix path --- docs/_static/llama-stack-spec.html | 48 +++++++++---------- docs/_static/llama-stack-spec.yaml | 31 ++++++------ llama_stack/apis/eval_tasks/eval_tasks.py | 6 +-- .../distribution/routers/routing_tables.py | 6 +-- 4 files changed, 44 insertions(+), 47 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 3106bff86b..6d3e5b93b4 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -1503,7 +1503,7 @@ } } }, - "/v1/eval/tasks/": { + "/v1/eval/tasks": { "get": { "responses": { "200": { @@ -1522,6 +1522,28 @@ ], "description": "", "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "EvalTasks" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterEvalTaskRequest" + } + } + }, + "required": true + } } }, "/v1/models": { @@ -2099,30 +2121,6 @@ ] } }, - "/v1/eval/tasks": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "EvalTasks" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterEvalTaskRequest" - } - } - }, - "required": true - } - } - }, "/v1/eval/tasks/{task_id}/jobs": { "post": { "responses": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 9b4220018d..aa25c88f8b 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -895,7 +895,7 @@ paths: schema: $ref: '#/components/schemas/RegisterDatasetRequest' required: true - /v1/eval/tasks/: + /v1/eval/tasks: get: responses: '200': @@ -908,6 +908,20 @@ paths: - EvalTasks description: '' parameters: [] + post: + responses: + '200': + description: OK + tags: + - EvalTasks + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterEvalTaskRequest' + required: true /v1/models: get: responses: @@ -1264,21 +1278,6 @@ paths: type: array items: type: string - /v1/eval/tasks: - post: - responses: - '200': - description: OK - tags: - - EvalTasks - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterEvalTaskRequest' - required: true /v1/eval/tasks/{task_id}/jobs: post: responses: diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index f36d44c887..0a1a27885d 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -25,7 +25,7 @@ class EvalTask(CommonEvalTaskFields, Resource): type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value @property - def eval_task_id(self) -> str: + def task_id(self) -> str: return self.identifier @property @@ -34,7 +34,7 @@ def provider_eval_task_id(self) -> 
str: class EvalTaskInput(CommonEvalTaskFields, BaseModel): - eval_task_id: str + task_id: str provider_id: Optional[str] = None provider_eval_task_id: Optional[str] = None @@ -45,7 +45,7 @@ class ListEvalTasksResponse(BaseModel): @runtime_checkable class EvalTasks(Protocol): - @webmethod(route="/eval/tasks/", method="GET") + @webmethod(route="/eval/tasks", method="GET") async def list_eval_tasks(self) -> ListEvalTasksResponse: ... @webmethod(route="/eval/tasks/{task_id}", method="GET") diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 3f5dea66d1..0664e310a5 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -437,7 +437,7 @@ async def get_eval_task(self, task_id: str) -> Optional[EvalTask]: async def register_eval_task( self, - eval_task_id: str, + task_id: str, dataset_id: str, scoring_functions: List[str], metadata: Optional[Dict[str, Any]] = None, @@ -454,9 +454,9 @@ async def register_eval_task( "No provider specified and multiple providers available. Please specify a provider_id." ) if provider_eval_task_id is None: - provider_eval_task_id = eval_task_id + provider_eval_task_id = task_id eval_task = EvalTask( - identifier=eval_task_id, + identifier=task_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, From 79e7253625529d68a4465ef045cece2d95de3125 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 11:21:51 -0800 Subject: [PATCH 05/31] deprecation in OpenAPI spec --- docs/_static/llama-stack-spec.html | 178 ++++++++++-------- docs/_static/llama-stack-spec.yaml | 110 ++++++----- docs/openapi_generator/pyopenapi/generator.py | 6 +- .../pyopenapi/specification.py | 1 + llama_stack/apis/eval_tasks/eval_tasks.py | 5 + 5 files changed, 173 insertions(+), 127 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 6d3e5b93b4..459071d3f3 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,6 +40,28 @@ } ], "paths": { + "/v1/eval-tasks": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListEvalTasksResponse" + } + } + } + } + }, + "tags": [ + "EvalTasks" + ], + "description": "", + "parameters": [], + "deprecated": true + } + }, "/v1/datasetio/rows": { "get": { "responses": { @@ -2365,6 +2387,84 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { + "EvalTask": { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "provider_resource_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "eval_task", + "default": "eval_task" + }, + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "identifier", + "provider_resource_id", + "provider_id", + "type", + "dataset_id", + "scoring_functions", + "metadata" + ] + }, + "ListEvalTasksResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + 
"items": { + "$ref": "#/components/schemas/EvalTask" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ] + }, "AppendRowsRequest": { "type": "object", "properties": { @@ -5208,69 +5308,6 @@ "type" ] }, - "EvalTask": { - "type": "object", - "properties": { - "identifier": { - "type": "string" - }, - "provider_resource_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "type": { - "type": "string", - "const": "eval_task", - "default": "eval_task" - }, - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "identifier", - "provider_resource_id", - "provider_id", - "type", - "dataset_id", - "scoring_functions", - "metadata" - ] - }, "Model": { "type": "object", "properties": { @@ -6164,21 +6201,6 @@ "data" ] }, - "ListEvalTasksResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/EvalTask" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ] - }, "ListModelsResponse": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index aa25c88f8b..b37f09ef88 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,6 +10,20 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: + /v1/eval-tasks: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListEvalTasksResponse' + tags: + - EvalTasks + description: '' + parameters: [] + deprecated: true /v1/datasetio/rows: get: responses: @@ -1429,6 +1443,54 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: + EvalTask: + type: object + properties: + identifier: + type: string + provider_resource_id: + type: string + provider_id: + type: string + type: + type: string + const: eval_task + default: eval_task + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - identifier + - provider_resource_id + - provider_id + - type + - dataset_id + - scoring_functions + - metadata + ListEvalTasksResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/EvalTask' + additionalProperties: false + required: + - data AppendRowsRequest: type: object properties: @@ -3354,44 +3416,6 @@ components: additionalProperties: false required: - type - EvalTask: - type: object - properties: - identifier: - type: string - provider_resource_id: - type: string - provider_id: - type: string - type: - type: string - const: eval_task - default: eval_task - dataset_id: - type: string - scoring_functions: - type: array - items: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - identifier - - provider_resource_id - - 
provider_id - - type - - dataset_id - - scoring_functions - - metadata Model: type: object properties: @@ -3930,16 +3954,6 @@ components: additionalProperties: false required: - data - ListEvalTasksResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/EvalTask' - additionalProperties: false - required: - - data ListModelsResponse: type: object properties: diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index f0d30a0e65..86db8a06d9 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -644,7 +644,10 @@ def _build_operation(self, op: EndpointOperation) -> Operation: else: callbacks = None - description = "\n".join(filter(None, [doc_string.short_description, doc_string.long_description])) + description = "\n".join( + filter(None, [doc_string.short_description, doc_string.long_description]) + ) + return Operation( tags=[op.defining_class.__name__], summary=None, @@ -654,6 +657,7 @@ def _build_operation(self, op: EndpointOperation) -> Operation: requestBody=requestBody, responses=responses, callbacks=callbacks, + deprecated=True if "DEPRECATED" in op.func_name else None, security=[] if op.public else None, ) diff --git a/docs/openapi_generator/pyopenapi/specification.py b/docs/openapi_generator/pyopenapi/specification.py index 4b54295c56..f96de58b69 100644 --- a/docs/openapi_generator/pyopenapi/specification.py +++ b/docs/openapi_generator/pyopenapi/specification.py @@ -117,6 +117,7 @@ class Operation: requestBody: Optional[RequestBody] = None callbacks: Optional[Dict[str, "Callback"]] = None security: Optional[List["SecurityRequirement"]] = None + deprecated: Optional[bool] = None @dataclass diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index 0a1a27885d..9a26fd0c0d 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -45,6 +45,11 @@ class ListEvalTasksResponse(BaseModel): @runtime_checkable class EvalTasks(Protocol): + @webmethod(route="/eval-tasks", method="GET") + async def DEPRECATED_list_eval_tasks_deprecated( + self, + ) -> ListEvalTasksResponse: ... + @webmethod(route="/eval/tasks", method="GET") async def list_eval_tasks(self) -> ListEvalTasksResponse: ... 
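
The net effect of PATCH 05 is that the OpenAPI `deprecated` flag is driven purely by a naming convention on the protocol methods. A minimal sketch of that rule, with illustrative method names:

```python
# Mirrors the generator change above:
#   deprecated=True if "DEPRECATED" in op.func_name else None
# Any endpoint whose Python method name contains "DEPRECATED" is emitted
# with `deprecated: true` in the generated spec.

def operation_deprecated(func_name: str) -> bool:
    return "DEPRECATED" in func_name


assert operation_deprecated("DEPRECATED_list_eval_tasks")
assert not operation_deprecated("list_eval_tasks")
```
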
From 65ffcddd84269330e2921784616442056c7c2453 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 11:35:21 -0800 Subject: [PATCH 06/31] deprecation --- docs/_static/llama-stack-spec.html | 115 ++++++++++++++++++ docs/_static/llama-stack-spec.yaml | 66 ++++++++++ llama_stack/apis/eval_tasks/eval_tasks.py | 25 +++- .../distribution/routers/routing_tables.py | 20 +++ 4 files changed, 221 insertions(+), 5 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 459071d3f3..188ba96a4a 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,6 +40,44 @@ } ], "paths": { + "/v1/eval-tasks/{eval_task_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/EvalTask" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "EvalTasks" + ], + "description": "", + "parameters": [ + { + "name": "eval_task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, "/v1/eval-tasks": { "get": { "responses": { @@ -60,6 +98,29 @@ "description": "", "parameters": [], "deprecated": true + }, + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "EvalTasks" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest" + } + } + }, + "required": true + }, + "deprecated": true } }, "/v1/datasetio/rows": { @@ -2465,6 +2526,60 @@ "data" ] }, + "DeprecatedRegisterEvalTaskRequest": { + "type": "object", + "properties": { + "eval_task_id": { + "type": "string" + }, + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "provider_eval_task_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "eval_task_id", + "dataset_id", + "scoring_functions" + ] + }, "AppendRowsRequest": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index b37f09ef88..ed5b71d0d8 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,6 +10,27 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: + /v1/eval-tasks/{eval_task_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/EvalTask' + - type: 'null' + tags: + - EvalTasks + description: '' + parameters: + - name: eval_task_id + in: path + required: true + schema: + type: string + deprecated: true /v1/eval-tasks: get: responses: @@ -24,6 +45,21 @@ paths: description: '' parameters: [] deprecated: true + post: + responses: + '200': + description: OK + tags: + - EvalTasks + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' + required: true + deprecated: true /v1/datasetio/rows: get: responses: @@ -1491,6 +1527,36 @@ components: additionalProperties: false required: - data 
+ DeprecatedRegisterEvalTaskRequest: + type: object + properties: + eval_task_id: + type: string + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + provider_eval_task_id: + type: string + provider_id: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - eval_task_id + - dataset_id + - scoring_functions AppendRowsRequest: type: object properties: diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index 9a26fd0c0d..6d12fd2f7f 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -45,11 +45,6 @@ class ListEvalTasksResponse(BaseModel): @runtime_checkable class EvalTasks(Protocol): - @webmethod(route="/eval-tasks", method="GET") - async def DEPRECATED_list_eval_tasks_deprecated( - self, - ) -> ListEvalTasksResponse: ... - @webmethod(route="/eval/tasks", method="GET") async def list_eval_tasks(self) -> ListEvalTasksResponse: ... @@ -69,3 +64,23 @@ async def register_eval_task( provider_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, ) -> None: ... + + @webmethod(route="/eval-tasks", method="GET") + async def DEPRECATED_list_eval_tasks(self) -> ListEvalTasksResponse: ... + + @webmethod(route="/eval-tasks/{eval_task_id}", method="GET") + async def DEPRECATED_get_eval_task( + self, + eval_task_id: str, + ) -> Optional[EvalTask]: ... + + @webmethod(route="/eval-tasks", method="POST") + async def DEPRECATED_register_eval_task( + self, + eval_task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_eval_task_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... 
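
With both method families on the protocol, a caller holding any `EvalTasks` implementation can reach either surface. A hedged usage sketch — `impl` is a stand-in for an implementing object, not a name from the patch:

```python
# New-style surface: /v1/eval/tasks and /v1/eval/tasks/{task_id};
# legacy surface: /v1/eval-tasks, kept for compatibility but deprecated.
async def demo(impl) -> None:
    tasks = await impl.list_eval_tasks()
    if tasks.data:
        task = await impl.get_eval_task(task_id=tasks.data[0].identifier)

    # The routing-table diff below makes the legacy methods raise
    # DeprecationWarning rather than serve results.
    await impl.DEPRECATED_list_eval_tasks()
```
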
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 0664e310a5..98e3afd3ff 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -465,6 +465,26 @@ async def register_eval_task( ) await self.register_object(eval_task) + async def DEPRECATED_list_eval_tasks(self) -> ListEvalTasksResponse: + raise DeprecationWarning("Use /eval/tasks instead") + + async def DEPRECATED_get_eval_task( + self, + eval_task_id: str, + ) -> Optional[EvalTask]: + raise DeprecationWarning("Use /eval/tasks instead") + + async def DEPRECATED_register_eval_task( + self, + eval_task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_eval_task_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: + raise DeprecationWarning("Use /eval/tasks instead") + class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): async def list_tools(self, toolgroup_id: Optional[str] = None) -> ListToolsResponse: From 9a8f4025c1296b02db74fdd48a9d2ac6afe77348 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:29:36 -0800 Subject: [PATCH 07/31] naming update --- docs/_static/llama-stack-spec.html | 72 +++++++++---------- docs/_static/llama-stack-spec.yaml | 72 +++++++++---------- docs/getting_started.ipynb | 4 +- .../Llama_Stack_Benchmark_Evals.ipynb | 12 ++-- docs/source/building_applications/evals.md | 8 +-- .../building_applications/evaluation.md | 4 +- docs/source/concepts/evaluation_concepts.md | 4 +- docs/source/concepts/index.md | 2 +- docs/source/playground/index.md | 4 +- .../references/evals_reference/index.md | 24 +++---- .../llama_stack_client_cli_reference.md | 10 +-- .../references/python_sdk_reference/index.md | 14 ++-- llama_stack/apis/datatypes.py | 2 +- llama_stack/apis/eval/eval.py | 14 ++-- llama_stack/apis/eval_tasks/__init__.py | 2 +- llama_stack/apis/eval_tasks/eval_tasks.py | 44 ++++++------ llama_stack/apis/resource.py | 2 +- llama_stack/distribution/datatypes.py | 9 +-- llama_stack/distribution/distribution.py | 2 +- llama_stack/distribution/resolver.py | 8 +-- llama_stack/distribution/routers/__init__.py | 4 +- llama_stack/distribution/routers/routers.py | 8 +-- .../distribution/routers/routing_tables.py | 49 ++++++------- llama_stack/distribution/stack.py | 6 +- llama_stack/distribution/ui/README.md | 2 +- .../ui/page/distribution/eval_tasks.py | 10 +-- .../ui/page/distribution/resources.py | 4 +- .../ui/page/evaluations/native_eval.py | 44 ++++++------ llama_stack/providers/datatypes.py | 7 +- .../inline/eval/meta_reference/eval.py | 36 +++++----- llama_stack/providers/tests/eval/test_eval.py | 42 +++++------ llama_stack/providers/tests/resolver.py | 7 +- llama_stack/templates/bedrock/run.yaml | 2 +- llama_stack/templates/cerebras/run.yaml | 2 +- .../templates/dell/run-with-safety.yaml | 2 +- llama_stack/templates/dell/run.yaml | 2 +- .../experimental-post-training/run.yaml | 2 +- .../templates/fireworks/run-with-safety.yaml | 2 +- llama_stack/templates/fireworks/run.yaml | 2 +- .../hf-endpoint/run-with-safety.yaml | 2 +- llama_stack/templates/hf-endpoint/run.yaml | 2 +- .../hf-serverless/run-with-safety.yaml | 2 +- llama_stack/templates/hf-serverless/run.yaml | 2 +- .../meta-reference-gpu/run-with-safety.yaml | 2 +- .../templates/meta-reference-gpu/run.yaml | 2 +- .../meta-reference-quantized-gpu/run.yaml | 2 +- llama_stack/templates/nvidia/run.yaml | 2 +- 
.../templates/ollama/run-with-safety.yaml | 2 +- llama_stack/templates/ollama/run.yaml | 2 +- .../remote-vllm/run-with-safety.yaml | 2 +- llama_stack/templates/remote-vllm/run.yaml | 2 +- llama_stack/templates/sambanova/run.yaml | 2 +- .../templates/tgi/run-with-safety.yaml | 2 +- llama_stack/templates/tgi/run.yaml | 2 +- .../templates/together/run-with-safety.yaml | 2 +- llama_stack/templates/together/run.yaml | 2 +- llama_stack/templates/vllm-gpu/run.yaml | 2 +- 57 files changed, 293 insertions(+), 289 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 188ba96a4a..84c6fd99df 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,7 +40,7 @@ } ], "paths": { - "/v1/eval-tasks/{eval_task_id}": { + "/v1/eval-tasks/{benchmark_id}": { "get": { "responses": { "200": { @@ -50,7 +50,7 @@ "schema": { "oneOf": [ { - "$ref": "#/components/schemas/EvalTask" + "$ref": "#/components/schemas/Benchmark" }, { "type": "null" @@ -62,12 +62,12 @@ } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [ { - "name": "eval_task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -86,14 +86,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListEvalTasksResponse" + "$ref": "#/components/schemas/ListBenchmarksResponse" } } } } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [], @@ -106,7 +106,7 @@ } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [], @@ -114,7 +114,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest" + "$ref": "#/components/schemas/DeprecatedRegisterBenchmarkRequest" } } }, @@ -821,7 +821,7 @@ "schema": { "oneOf": [ { - "$ref": "#/components/schemas/EvalTask" + "$ref": "#/components/schemas/Benchmark" }, { "type": "null" @@ -833,7 +833,7 @@ } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [ @@ -1594,14 +1594,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListEvalTasksResponse" + "$ref": "#/components/schemas/ListBenchmarksResponse" } } } } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [] @@ -1613,7 +1613,7 @@ } }, "tags": [ - "EvalTasks" + "Benchmarks" ], "description": "", "parameters": [], @@ -1621,7 +1621,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RegisterEvalTaskRequest" + "$ref": "#/components/schemas/RegisterBenchmarkRequest" } } }, @@ -2448,7 +2448,7 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { - "EvalTask": { + "Benchmark": { "type": "object", "properties": { "identifier": { @@ -2462,8 +2462,8 @@ }, "type": { "type": "string", - "const": "eval_task", - "default": "eval_task" + "const": "benchmark", + "default": "benchmark" }, "dataset_id": { "type": "string" @@ -2511,13 +2511,13 @@ "metadata" ] }, - "ListEvalTasksResponse": { + "ListBenchmarksResponse": { "type": "object", "properties": { "data": { "type": "array", "items": { - "$ref": "#/components/schemas/EvalTask" + "$ref": "#/components/schemas/Benchmark" } } }, @@ -2526,10 +2526,10 @@ "data" ] }, - "DeprecatedRegisterEvalTaskRequest": { + "DeprecatedRegisterBenchmarkRequest": { "type": "object", "properties": { - "eval_task_id": { + "benchmark_id": { "type": "string" }, "dataset_id": { @@ -2541,7 +2541,7 @@ "type": "string" } }, - 
"provider_eval_task_id": { + "provider_benchmark_id": { "type": "string" }, "provider_id": { @@ -2575,7 +2575,7 @@ }, "additionalProperties": false, "required": [ - "eval_task_id", + "benchmark_id", "dataset_id", "scoring_functions" ] @@ -4745,7 +4745,7 @@ "accuracy" ] }, - "AppEvalTaskConfig": { + "AppBenchmarkConfig": { "type": "object", "properties": { "type": { @@ -4793,7 +4793,7 @@ "type" ] }, - "BenchmarkEvalTaskConfig": { + "BenchmarkBenchmarkConfig": { "type": "object", "properties": { "type": { @@ -4831,20 +4831,20 @@ } } }, - "EvalTaskConfig": { + "BenchmarkConfig": { "oneOf": [ { - "$ref": "#/components/schemas/BenchmarkEvalTaskConfig" + "$ref": "#/components/schemas/BenchmarkBenchmarkConfig" }, { - "$ref": "#/components/schemas/AppEvalTaskConfig" + "$ref": "#/components/schemas/AppBenchmarkConfig" } ], "discriminator": { "propertyName": "type", "mapping": { - "benchmark": "#/components/schemas/BenchmarkEvalTaskConfig", - "app": "#/components/schemas/AppEvalTaskConfig" + "benchmark": "#/components/schemas/BenchmarkBenchmarkConfig", + "app": "#/components/schemas/AppBenchmarkConfig" } } }, @@ -4991,7 +4991,7 @@ } }, "task_config": { - "$ref": "#/components/schemas/EvalTaskConfig" + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, @@ -7358,7 +7358,7 @@ "url" ] }, - "RegisterEvalTaskRequest": { + "RegisterBenchmarkRequest": { "type": "object", "properties": { "task_id": { @@ -7373,7 +7373,7 @@ "type": "string" } }, - "provider_eval_task_id": { + "provider_benchmark_id": { "type": "string" }, "provider_id": { @@ -7603,7 +7603,7 @@ "type": "object", "properties": { "task_config": { - "$ref": "#/components/schemas/EvalTaskConfig" + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, @@ -8115,7 +8115,7 @@ "name": "Eval" }, { - "name": "EvalTasks" + "name": "Benchmarks" }, { "name": "Inference", @@ -8171,7 +8171,7 @@ "DatasetIO", "Datasets", "Eval", - "EvalTasks", + "Benchmarks", "Inference", "Inspect", "Models", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index ed5b71d0d8..dd0951fdec 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,7 +10,7 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: - /v1/eval-tasks/{eval_task_id}: + /v1/eval-tasks/{benchmark_id}: get: responses: '200': @@ -19,13 +19,13 @@ paths: application/json: schema: oneOf: - - $ref: '#/components/schemas/EvalTask' + - $ref: '#/components/schemas/Benchmark' - type: 'null' tags: - - EvalTasks + - Benchmarks description: '' parameters: - - name: eval_task_id + - name: benchmark_id in: path required: true schema: @@ -39,9 +39,9 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ListEvalTasksResponse' + $ref: '#/components/schemas/ListBenchmarksResponse' tags: - - EvalTasks + - Benchmarks description: '' parameters: [] deprecated: true @@ -50,14 +50,14 @@ paths: '200': description: OK tags: - - EvalTasks + - Benchmarks description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' + $ref: '#/components/schemas/DeprecatedRegisterBenchmarkRequest' required: true deprecated: true /v1/datasetio/rows: @@ -499,10 +499,10 @@ paths: application/json: schema: oneOf: - - $ref: '#/components/schemas/EvalTask' + - $ref: '#/components/schemas/Benchmark' - type: 'null' tags: - - EvalTasks + - Benchmarks description: '' parameters: - name: task_id @@ -953,9 +953,9 @@ paths: 
content: application/json: schema: - $ref: '#/components/schemas/ListEvalTasksResponse' + $ref: '#/components/schemas/ListBenchmarksResponse' tags: - - EvalTasks + - Benchmarks description: '' parameters: [] post: @@ -963,14 +963,14 @@ paths: '200': description: OK tags: - - EvalTasks + - Benchmarks description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/RegisterEvalTaskRequest' + $ref: '#/components/schemas/RegisterBenchmarkRequest' required: true /v1/models: get: @@ -1479,7 +1479,7 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: - EvalTask: + Benchmark: type: object properties: identifier: @@ -1490,8 +1490,8 @@ components: type: string type: type: string - const: eval_task - default: eval_task + const: benchmark + default: benchmark dataset_id: type: string scoring_functions: @@ -1517,20 +1517,20 @@ components: - dataset_id - scoring_functions - metadata - ListEvalTasksResponse: + ListBenchmarksResponse: type: object properties: data: type: array items: - $ref: '#/components/schemas/EvalTask' + $ref: '#/components/schemas/Benchmark' additionalProperties: false required: - data - DeprecatedRegisterEvalTaskRequest: + DeprecatedRegisterBenchmarkRequest: type: object properties: - eval_task_id: + benchmark_id: type: string dataset_id: type: string @@ -1538,7 +1538,7 @@ components: type: array items: type: string - provider_eval_task_id: + provider_benchmark_id: type: string provider_id: type: string @@ -1554,7 +1554,7 @@ components: - type: object additionalProperties: false required: - - eval_task_id + - benchmark_id - dataset_id - scoring_functions AppendRowsRequest: @@ -3063,7 +3063,7 @@ components: - median - categorical_count - accuracy - AppEvalTaskConfig: + AppBenchmarkConfig: type: object properties: type: @@ -3097,7 +3097,7 @@ components: additionalProperties: false required: - type - BenchmarkEvalTaskConfig: + BenchmarkBenchmarkConfig: type: object properties: type: @@ -3121,15 +3121,15 @@ components: mapping: model: '#/components/schemas/ModelCandidate' agent: '#/components/schemas/AgentCandidate' - EvalTaskConfig: + BenchmarkConfig: oneOf: - - $ref: '#/components/schemas/BenchmarkEvalTaskConfig' - - $ref: '#/components/schemas/AppEvalTaskConfig' + - $ref: '#/components/schemas/BenchmarkBenchmarkConfig' + - $ref: '#/components/schemas/AppBenchmarkConfig' discriminator: propertyName: type mapping: - benchmark: '#/components/schemas/BenchmarkEvalTaskConfig' - app: '#/components/schemas/AppEvalTaskConfig' + benchmark: '#/components/schemas/BenchmarkBenchmarkConfig' + app: '#/components/schemas/AppBenchmarkConfig' LLMAsJudgeScoringFnParams: type: object properties: @@ -3220,7 +3220,7 @@ components: items: type: string task_config: - $ref: '#/components/schemas/EvalTaskConfig' + $ref: '#/components/schemas/BenchmarkConfig' additionalProperties: false required: - input_rows @@ -4675,7 +4675,7 @@ components: - dataset_id - dataset_schema - url - RegisterEvalTaskRequest: + RegisterBenchmarkRequest: type: object properties: task_id: @@ -4686,7 +4686,7 @@ components: type: array items: type: string - provider_eval_task_id: + provider_benchmark_id: type: string provider_id: type: string @@ -4815,7 +4815,7 @@ components: type: object properties: task_config: - $ref: '#/components/schemas/EvalTaskConfig' + $ref: '#/components/schemas/BenchmarkConfig' additionalProperties: false required: - task_config @@ -5128,7 +5128,7 @@ tags: - name: DatasetIO - name: Datasets - name: Eval - - name: EvalTasks + - 
name: Benchmarks - name: Inference description: >- This API provides the raw interface to the underlying models. Two kinds of models @@ -5162,7 +5162,7 @@ x-tagGroups: - DatasetIO - Datasets - Eval - - EvalTasks + - Benchmarks - Inference - Inspect - Models diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index abe537c8e1..ee616b4716 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -324,7 +324,7 @@ "- vector_io\n", "container_image: null\n", "datasets: []\n", - "eval_tasks: []\n", + "benchmarks: []\n", "image_name: together\n", "metadata_store:\n", " db_path: /Users/ashwin/.llama/distributions/together/registry.db\n", @@ -508,7 +508,7 @@ "- vector_io\n", "container_image: null\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "image_name: together\n", "metadata_store:\n", " db_path: \u001b[35m/Users/ashwin/.llama/distributions/together/\u001b[0m\u001b[95mregistry.db\u001b[0m\n", diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 84da252469..6e8480f945 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -370,7 +370,7 @@ "- tool_runtime\n", "datasets: []\n", "container_image: null\n", - "eval_tasks: []\n", + "benchmarks: []\n", "image_name: together\n", "memory_banks: []\n", "metadata_store:\n", @@ -551,7 +551,7 @@ "- tool_runtime\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "container_image: null\n", - "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "image_name: together\n", "memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "metadata_store:\n", @@ -1017,8 +1017,8 @@ " \"content\": SYSTEM_PROMPT_TEMPLATE.format(subject=subset),\n", "}\n", "\n", - "client.eval_tasks.register(\n", - " eval_task_id=\"meta-reference::mmmu\",\n", + "client.benchmarks.register(\n", + " benchmark_id=\"meta-reference::mmmu\",\n", " dataset_id=f\"mmmu-{subset}-{split}\",\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", ")\n", @@ -1196,8 +1196,8 @@ " provider_id=\"together\",\n", ")\n", "\n", - "client.eval_tasks.register(\n", - " eval_task_id=\"meta-reference::simpleqa\",\n", + "client.benchmarks.register(\n", + " benchmark_id=\"meta-reference::simpleqa\",\n", " dataset_id=simpleqa_dataset_id,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", ")\n", diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md index c4cb476e4f..c1c371ca80 100644 --- a/docs/source/building_applications/evals.md +++ b/docs/source/building_applications/evals.md @@ -41,8 +41,8 @@ system_message = { "content": SYSTEM_PROMPT_TEMPLATE, } -client.eval_tasks.register( - eval_task_id="meta-reference::mmmu", +client.benchmarks.register( + benchmark_id="meta-reference::mmmu", dataset_id=f"mmmu-{subset}-{split}", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) @@ -99,8 +99,8 @@ eval_rows = client.datasetio.get_rows_paginated( ``` ```python -client.eval_tasks.register( - eval_task_id="meta-reference::simpleqa", +client.benchmarks.register( + benchmark_id="meta-reference::simpleqa", dataset_id=simpleqa_dataset_id, scoring_functions=["llm-as-judge::405b-simpleqa"], ) diff --git a/docs/source/building_applications/evaluation.md 
b/docs/source/building_applications/evaluation.md index 91e5c552bd..df18c146cc 100644 --- a/docs/source/building_applications/evaluation.md +++ b/docs/source/building_applications/evaluation.md @@ -10,8 +10,8 @@ Here's how to set up basic evaluation: ```python # Create an evaluation task -response = client.eval_tasks.register( - eval_task_id="my_eval", +response = client.benchmarks.register( + benchmark_id="my_eval", dataset_id="my_dataset", scoring_functions=["accuracy", "relevance"], ) diff --git a/docs/source/concepts/evaluation_concepts.md b/docs/source/concepts/evaluation_concepts.md index 399d99d92d..3ca4b0ac8e 100644 --- a/docs/source/concepts/evaluation_concepts.md +++ b/docs/source/concepts/evaluation_concepts.md @@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications. - `/datasetio` + `/datasets` API - `/scoring` + `/scoring_functions` API -- `/eval` + `/eval_tasks` API +- `/eval` + `/benchmarks` API This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). @@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo - **Scoring**: evaluate outputs of the system. - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics. - **Eval**: generate outputs (via Inference or Agents) and perform scoring. - - Associated with `EvalTask` resource. + - Associated with `Benchmark` resource. Use the following decision tree to decide how to use LlamaStack Evaluation flow. diff --git a/docs/source/concepts/index.md b/docs/source/concepts/index.md index 1437ec6232..403e47c489 100644 --- a/docs/source/concepts/index.md +++ b/docs/source/concepts/index.md @@ -42,7 +42,7 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi - **Tool Runtime** is associated with `ToolGroup` resources. - **DatasetIO** is associated with `Dataset` resources. - **Scoring** is associated with `ScoringFunction` resources. -- **Eval** is associated with `Model` and `EvalTask` resources. +- **Eval** is associated with `Model` and `Benchmark` resources. Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack. 
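
The docs hunks above all apply the same mechanical rename; in client code the before/after looks roughly like this (a hedged sketch — the base URL and dataset id are illustrative assumptions, not values from the patch):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")  # assumed local server

# Before this patch the docs used:
#   client.eval_tasks.register(eval_task_id="meta-reference::mmmu", ...)

# After this patch they use the renamed surface:
client.benchmarks.register(
    benchmark_id="meta-reference::mmmu",
    dataset_id="mmmu-val-dev",  # illustrative; the notebook derives this from subset/split
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
```
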
diff --git a/docs/source/playground/index.md b/docs/source/playground/index.md index d74bf1a03b..9691609abf 100644 --- a/docs/source/playground/index.md +++ b/docs/source/playground/index.md @@ -64,7 +64,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie ``` ```bash - $ llama-stack-client eval_tasks register \ + $ llama-stack-client benchmarks register \ --eval-task-id meta-reference-mmlu \ --provider-id meta-reference \ --dataset-id mmlu \ @@ -86,7 +86,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie - Under the hood, it uses Llama Stack's `/providers` API to get information about the providers. - **API Resources**: Inspect Llama Stack API resources - - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `eval_tasks`, `shields`). + - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `benchmarks`, `shields`). - Under the hood, it uses Llama Stack's `//list` API to get information about each resources. - Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources. diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md index 86f66208af..f0275511df 100644 --- a/docs/source/references/evals_reference/index.md +++ b/docs/source/references/evals_reference/index.md @@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications. - `/datasetio` + `/datasets` API - `/scoring` + `/scoring_functions` API -- `/eval` + `/eval_tasks` API +- `/eval` + `/benchmarks` API This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). @@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo - **Scoring**: evaluate outputs of the system. - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics. - **Eval**: generate outputs (via Inference or Agents) and perform scoring. - - Associated with `EvalTask` resource. + - Associated with `Benchmark` resource. Use the following decision tree to decide how to use LlamaStack Evaluation flow. 
@@ -77,8 +77,8 @@ system_message = { "content": SYSTEM_PROMPT_TEMPLATE, } -client.eval_tasks.register( - eval_task_id="meta-reference::mmmu", +client.benchmarks.register( + benchmark_id="meta-reference::mmmu", dataset_id=f"mmmu-{subset}-{split}", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) @@ -135,8 +135,8 @@ eval_rows = client.datasetio.get_rows_paginated( ``` ```python -client.eval_tasks.register( - eval_task_id="meta-reference::simpleqa", +client.benchmarks.register( + benchmark_id="meta-reference::simpleqa", dataset_id=simpleqa_dataset_id, scoring_functions=["llm-as-judge::405b-simpleqa"], ) @@ -281,7 +281,7 @@ The following examples give the quick steps to start running evaluations using t #### Benchmark Evaluation CLI Usage: There are 2 inputs necessary for running a benchmark eval -- `eval-task-id`: the identifier associated with the eval task. Each `EvalTask` is parametrized by +- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by - `dataset_id`: the identifier associated with the dataset. - `List[scoring_function_id]`: list of scoring function identifiers. - `eval-task-config`: specifies the configuration of the model / agent to evaluate on. @@ -289,7 +289,7 @@ Usage: There are 2 inputs necessary for running a benchmark eval ``` llama-stack-client eval run_benchmark \ ---eval-task-config ~/eval_task_config.json \ +--eval-task-config ~/benchmark_config.json \ --visualize ``` @@ -309,15 +309,15 @@ llama-stack-client eval run_scoring ... --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] +$ llama-stack-client benchmarks register --eval-task-id --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] ``` Options: @@ -191,7 +191,7 @@ Options: - `--num-examples`: Optional. Number of examples to evaluate (useful for debugging) - `--visualize`: Optional flag. 
If set, visualizes evaluation results after completion -Example eval_task_config.json: +Example benchmark_config.json: ```json { "type": "benchmark", diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md index 8a06e22442..eca8c58f54 100644 --- a/docs/source/references/python_sdk_reference/index.md +++ b/docs/source/references/python_sdk_reference/index.md @@ -443,20 +443,20 @@ Methods: - client.scoring_functions.list() -> ScoringFunctionListResponse - client.scoring_functions.register(\*\*params) -> None -## EvalTasks +## Benchmarks Types: ```python from llama_stack_client.types import ( - EvalTask, - ListEvalTasksResponse, - EvalTaskListResponse, + Benchmark, + ListBenchmarksResponse, + BenchmarkListResponse, ) ``` Methods: -- client.eval_tasks.retrieve(eval_task_id) -> Optional[EvalTask] -- client.eval_tasks.list() -> EvalTaskListResponse -- client.eval_tasks.register(\*\*params) -> None +- client.benchmarks.retrieve(benchmark_id) -> Optional[Benchmark] +- client.benchmarks.list() -> BenchmarkListResponse +- client.benchmarks.register(\*\*params) -> None diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py index ccc395b80b..0751b2c9b2 100644 --- a/llama_stack/apis/datatypes.py +++ b/llama_stack/apis/datatypes.py @@ -28,7 +28,7 @@ class Api(Enum): vector_dbs = "vector_dbs" datasets = "datasets" scoring_functions = "scoring_functions" - eval_tasks = "eval_tasks" + benchmarks = "benchmarks" tool_groups = "tool_groups" # built-in API diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index ae13a5bd95..16b96d618b 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -38,7 +38,7 @@ class AgentCandidate(BaseModel): @json_schema_type -class BenchmarkEvalTaskConfig(BaseModel): +class BenchmarkBenchmarkConfig(BaseModel): type: Literal["benchmark"] = "benchmark" eval_candidate: EvalCandidate num_examples: Optional[int] = Field( @@ -48,7 +48,7 @@ class BenchmarkEvalTaskConfig(BaseModel): @json_schema_type -class AppEvalTaskConfig(BaseModel): +class AppBenchmarkConfig(BaseModel): type: Literal["app"] = "app" eval_candidate: EvalCandidate scoring_params: Dict[str, ScoringFnParams] = Field( @@ -62,9 +62,9 @@ class AppEvalTaskConfig(BaseModel): # we could optinally add any specific dataset config here -EvalTaskConfig = register_schema( - Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")], - name="EvalTaskConfig", +BenchmarkConfig = register_schema( + Annotated[Union[BenchmarkBenchmarkConfig, AppBenchmarkConfig], Field(discriminator="type")], + name="BenchmarkConfig", ) @@ -80,7 +80,7 @@ class Eval(Protocol): async def run_eval( self, task_id: str, - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> Job: ... @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") @@ -89,7 +89,7 @@ async def evaluate_rows( task_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/eval_tasks/__init__.py index 7ca2167068..f8f5649570 100644 --- a/llama_stack/apis/eval_tasks/__init__.py +++ b/llama_stack/apis/eval_tasks/__init__.py @@ -4,4 +4,4 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from .eval_tasks import * # noqa: F401 F403 +from .benchmarks import * # noqa: F401 F403 diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py index 6d12fd2f7f..7c8ed8dc04 100644 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -11,7 +11,7 @@ from llama_stack.apis.resource import Resource, ResourceType -class CommonEvalTaskFields(BaseModel): +class CommonBenchmarkFields(BaseModel): dataset_id: str scoring_functions: List[str] metadata: Dict[str, Any] = Field( @@ -21,66 +21,66 @@ class CommonEvalTaskFields(BaseModel): @json_schema_type -class EvalTask(CommonEvalTaskFields, Resource): - type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value +class Benchmark(CommonBenchmarkFields, Resource): + type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value @property def task_id(self) -> str: return self.identifier @property - def provider_eval_task_id(self) -> str: + def provider_benchmark_id(self) -> str: return self.provider_resource_id -class EvalTaskInput(CommonEvalTaskFields, BaseModel): +class BenchmarkInput(CommonBenchmarkFields, BaseModel): task_id: str provider_id: Optional[str] = None - provider_eval_task_id: Optional[str] = None + provider_benchmark_id: Optional[str] = None -class ListEvalTasksResponse(BaseModel): - data: List[EvalTask] +class ListBenchmarksResponse(BaseModel): + data: List[Benchmark] @runtime_checkable -class EvalTasks(Protocol): +class Benchmarks(Protocol): @webmethod(route="/eval/tasks", method="GET") - async def list_eval_tasks(self) -> ListEvalTasksResponse: ... + async def list_benchmarks(self) -> ListBenchmarksResponse: ... @webmethod(route="/eval/tasks/{task_id}", method="GET") - async def get_eval_task( + async def get_benchmark( self, task_id: str, - ) -> Optional[EvalTask]: ... + ) -> Optional[Benchmark]: ... @webmethod(route="/eval/tasks", method="POST") - async def register_eval_task( + async def register_benchmark( self, task_id: str, dataset_id: str, scoring_functions: List[str], - provider_eval_task_id: Optional[str] = None, + provider_benchmark_id: Optional[str] = None, provider_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, ) -> None: ... @webmethod(route="/eval-tasks", method="GET") - async def DEPRECATED_list_eval_tasks(self) -> ListEvalTasksResponse: ... + async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: ... - @webmethod(route="/eval-tasks/{eval_task_id}", method="GET") - async def DEPRECATED_get_eval_task( + @webmethod(route="/eval-tasks/{benchmark_id}", method="GET") + async def DEPRECATED_get_benchmark( self, - eval_task_id: str, - ) -> Optional[EvalTask]: ... + benchmark_id: str, + ) -> Optional[Benchmark]: ... @webmethod(route="/eval-tasks", method="POST") - async def DEPRECATED_register_eval_task( + async def DEPRECATED_register_benchmark( self, - eval_task_id: str, + benchmark_id: str, dataset_id: str, scoring_functions: List[str], - provider_eval_task_id: Optional[str] = None, + provider_benchmark_id: Optional[str] = None, provider_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, ) -> None: ... 
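
The renamed resource keeps the old identifiers reachable as read-only properties. A hedged sketch of that mapping — the import path mirrors the new `from llama_stack.apis.benchmarks import ...` in `datatypes.py` and assumes the module rename is complete; field values are illustrative:

```python
from llama_stack.apis.benchmarks import Benchmark

bench = Benchmark(
    identifier="meta-reference::mmmu",
    provider_resource_id="meta-reference::mmmu",
    provider_id="meta-reference",
    dataset_id="mmmu-val",  # illustrative dataset id
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)

# `task_id` and `provider_benchmark_id` are properties over Resource fields:
assert bench.task_id == bench.identifier
assert bench.provider_benchmark_id == bench.provider_resource_id
```
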
diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py index 145113a5d6..70ec63c55d 100644 --- a/llama_stack/apis/resource.py +++ b/llama_stack/apis/resource.py @@ -15,7 +15,7 @@ class ResourceType(Enum): vector_db = "vector_db" dataset = "dataset" scoring_function = "scoring_function" - eval_task = "eval_task" + benchmark = "benchmark" tool = "tool" tool_group = "tool_group" diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 97706f22a5..75ab73b9ba 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -8,10 +8,11 @@ from pydantic import BaseModel, Field +from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput + from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset, DatasetInput from llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput from llama_stack.apis.inference import Inference from llama_stack.apis.models import Model, ModelInput from llama_stack.apis.safety import Safety @@ -37,7 +38,7 @@ VectorDB, Dataset, ScoringFn, - EvalTask, + Benchmark, Tool, ToolGroup, ] @@ -50,7 +51,7 @@ VectorDB, Dataset, ScoringFn, - EvalTask, + Benchmark, Tool, ToolGroup, ], @@ -173,7 +174,7 @@ class StackRunConfig(BaseModel): vector_dbs: List[VectorDBInput] = Field(default_factory=list) datasets: List[DatasetInput] = Field(default_factory=list) scoring_fns: List[ScoringFnInput] = Field(default_factory=list) - eval_tasks: List[EvalTaskInput] = Field(default_factory=list) + benchmarks: List[BenchmarkInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) server: ServerConfig = Field( diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 2dcf38463b..384e2c3c89 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -44,7 +44,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: router_api=Api.scoring, ), AutoRoutedApiInfo( - routing_table_api=Api.eval_tasks, + routing_table_api=Api.benchmarks, router_api=Api.eval, ), AutoRoutedApiInfo( diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index 353c2971ba..0bc2e774c1 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -9,10 +9,10 @@ from typing import Any, Dict, List, Set from llama_stack.apis.agents import Agents +from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models @@ -37,8 +37,8 @@ from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.providers.datatypes import ( Api, + BenchmarksProtocolPrivate, DatasetsProtocolPrivate, - EvalTasksProtocolPrivate, InlineProviderSpec, ModelsProtocolPrivate, ProviderSpec, @@ -73,7 +73,7 @@ def api_protocol_map() -> Dict[Api, Any]: Api.scoring: Scoring, Api.scoring_functions: ScoringFunctions, Api.eval: Eval, - Api.eval_tasks: EvalTasks, + Api.benchmarks: Benchmarks, Api.post_training: PostTraining, Api.tool_groups: ToolGroups, Api.tool_runtime: ToolRuntime, @@ -92,7 +92,7 @@ def additional_protocols_map() -> Dict[Api, Any]: 
ScoringFunctions, Api.scoring_functions, ), - Api.eval: (EvalTasksProtocolPrivate, EvalTasks, Api.eval_tasks), + Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks), } diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py index 156cda3859..24defdcacb 100644 --- a/llama_stack/distribution/routers/__init__.py +++ b/llama_stack/distribution/routers/__init__.py @@ -12,8 +12,8 @@ from llama_stack.providers.datatypes import Api, RoutingTable from .routing_tables import ( + BenchmarksRoutingTable, DatasetsRoutingTable, - EvalTasksRoutingTable, ModelsRoutingTable, ScoringFunctionsRoutingTable, ShieldsRoutingTable, @@ -34,7 +34,7 @@ async def get_routing_table_impl( "shields": ShieldsRoutingTable, "datasets": DatasetsRoutingTable, "scoring_functions": ScoringFunctionsRoutingTable, - "eval_tasks": EvalTasksRoutingTable, + "benchmarks": BenchmarksRoutingTable, "tool_groups": ToolGroupsRoutingTable, } diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 6cddcf73cb..6697b03e26 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -9,9 +9,9 @@ from llama_stack.apis.common.content_types import InterleavedContent, URL from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult from llama_stack.apis.eval import ( - AppEvalTaskConfig, + AppBenchmarkConfig, + BenchmarkConfig, Eval, - EvalTaskConfig, EvaluateResponse, Job, JobStatus, @@ -348,7 +348,7 @@ async def shutdown(self) -> None: async def run_eval( self, task_id: str, - task_config: AppEvalTaskConfig, + task_config: AppBenchmarkConfig, ) -> Job: return await self.routing_table.get_provider_impl(task_id).run_eval( task_id=task_id, @@ -360,7 +360,7 @@ async def evaluate_rows( task_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: return await self.routing_table.get_provider_impl(task_id).evaluate_rows( task_id=task_id, diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 98e3afd3ff..6c1b06ed6b 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -8,10 +8,11 @@ from pydantic import TypeAdapter +from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse + from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ParamType from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse -from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType from llama_stack.apis.resource import ResourceType from llama_stack.apis.scoring_functions import ( @@ -60,7 +61,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable elif api == Api.scoring: return await p.register_scoring_function(obj) elif api == Api.eval: - return await p.register_eval_task(obj) + return await p.register_benchmark(obj) elif api == Api.tool_runtime: return await p.register_tool(obj) else: @@ -121,7 +122,7 @@ async def add_objects(objs: List[RoutableObjectWithProvider], provider_id: str, scoring_functions = await p.list_scoring_functions() await add_objects(scoring_functions, pid, ScoringFn) elif api == Api.eval: - p.eval_task_store = self + 
p.benchmark_store = self elif api == Api.tool_runtime: p.tool_store = self @@ -141,8 +142,8 @@ def apiname_object(): return ("DatasetIO", "dataset") elif isinstance(self, ScoringFunctionsRoutingTable): return ("Scoring", "scoring_function") - elif isinstance(self, EvalTasksRoutingTable): - return ("Eval", "eval_task") + elif isinstance(self, BenchmarksRoutingTable): + return ("Eval", "benchmark") elif isinstance(self, ToolGroupsRoutingTable): return ("Tools", "tool") else: @@ -428,20 +429,20 @@ async def register_scoring_function( await self.register_object(scoring_fn) -class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks): - async def list_eval_tasks(self) -> ListEvalTasksResponse: - return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task")) +class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): + async def list_benchmarks(self) -> ListBenchmarksResponse: + return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark")) - async def get_eval_task(self, task_id: str) -> Optional[EvalTask]: - return await self.get_object_by_identifier("eval_task", task_id) + async def get_benchmark(self, task_id: str) -> Optional[Benchmark]: + return await self.get_object_by_identifier("benchmark", task_id) - async def register_eval_task( + async def register_benchmark( self, task_id: str, dataset_id: str, scoring_functions: List[str], metadata: Optional[Dict[str, Any]] = None, - provider_eval_task_id: Optional[str] = None, + provider_benchmark_id: Optional[str] = None, provider_id: Optional[str] = None, ) -> None: if metadata is None: @@ -453,33 +454,33 @@ async def register_eval_task( raise ValueError( "No provider specified and multiple providers available. Please specify a provider_id." ) - if provider_eval_task_id is None: - provider_eval_task_id = task_id - eval_task = EvalTask( + if provider_benchmark_id is None: + provider_benchmark_id = task_id + benchmark = Benchmark( identifier=task_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, provider_id=provider_id, - provider_resource_id=provider_eval_task_id, + provider_resource_id=provider_benchmark_id, ) - await self.register_object(eval_task) + await self.register_object(benchmark) - async def DEPRECATED_list_eval_tasks(self) -> ListEvalTasksResponse: + async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: raise DeprecationWarning("Use /eval/tasks instead") - async def DEPRECATED_get_eval_task( + async def DEPRECATED_get_benchmark( self, - eval_task_id: str, - ) -> Optional[EvalTask]: + benchmark_id: str, + ) -> Optional[Benchmark]: raise DeprecationWarning("Use /eval/tasks instead") - async def DEPRECATED_register_eval_task( + async def DEPRECATED_register_benchmark( self, - eval_task_id: str, + benchmark_id: str, dataset_id: str, scoring_functions: List[str], - provider_eval_task_id: Optional[str] = None, + provider_benchmark_id: Optional[str] = None, provider_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, ) -> None: diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index 2baad8ac45..9335dc3a95 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -15,10 +15,10 @@ from llama_stack.apis.agents import Agents from llama_stack.apis.batch_inference import BatchInference +from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from llama_stack.apis.eval import Eval -from 
llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models @@ -53,7 +53,7 @@ class LlamaStack( PostTraining, VectorIO, Eval, - EvalTasks, + Benchmarks, Scoring, ScoringFunctions, DatasetIO, @@ -78,7 +78,7 @@ class LlamaStack( "register_scoring_function", "list_scoring_functions", ), - ("eval_tasks", Api.eval_tasks, "register_eval_task", "list_eval_tasks"), + ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"), ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"), ] diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md index c0a2597af5..8fceb5c63c 100644 --- a/llama_stack/distribution/ui/README.md +++ b/llama_stack/distribution/ui/README.md @@ -26,7 +26,7 @@ $ llama-stack-client datasets register \ ``` ```bash -$ llama-stack-client eval_tasks register \ +$ llama-stack-client benchmarks register \ --eval-task-id meta-reference-mmlu \ --provider-id meta-reference \ --dataset-id mmlu \ diff --git a/llama_stack/distribution/ui/page/distribution/eval_tasks.py b/llama_stack/distribution/ui/page/distribution/eval_tasks.py index f589696631..b83023901d 100644 --- a/llama_stack/distribution/ui/page/distribution/eval_tasks.py +++ b/llama_stack/distribution/ui/page/distribution/eval_tasks.py @@ -8,12 +8,12 @@ from modules.api import llama_stack_api -def eval_tasks(): +def benchmarks(): # Eval Tasks Section st.header("Eval Tasks") - eval_tasks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list()} + benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()} - if len(eval_tasks_info) > 0: - selected_eval_task = st.selectbox("Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect") - st.json(eval_tasks_info[selected_eval_task], expanded=True) + if len(benchmarks_info) > 0: + selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect") + st.json(benchmarks_info[selected_benchmark], expanded=True) diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py index 38d4945708..a86fda8565 100644 --- a/llama_stack/distribution/ui/page/distribution/resources.py +++ b/llama_stack/distribution/ui/page/distribution/resources.py @@ -4,8 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from page.distribution.benchmarks import benchmarks from page.distribution.datasets import datasets -from page.distribution.eval_tasks import eval_tasks from page.distribution.models import models from page.distribution.scoring_functions import scoring_functions from page.distribution.shields import shields @@ -36,7 +36,7 @@ def resources_page(): }, ) if selected_resource == "Eval Tasks": - eval_tasks() + benchmarks() elif selected_resource == "Vector Databases": vector_dbs() elif selected_resource == "Datasets": diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index c4a44990f8..e24da4eb38 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -13,28 +13,28 @@ from modules.api import llama_stack_api -def select_eval_task_1(): +def select_benchmark_1(): # Select Eval Tasks st.subheader("1. Choose An Eval Task") - eval_tasks = llama_stack_api.client.eval_tasks.list() - eval_tasks = {et.identifier: et for et in eval_tasks} - eval_tasks_names = list(eval_tasks.keys()) - selected_eval_task = st.selectbox( + benchmarks = llama_stack_api.client.benchmarks.list() + benchmarks = {et.identifier: et for et in benchmarks} + benchmarks_names = list(benchmarks.keys()) + selected_benchmark = st.selectbox( "Choose an eval task.", - options=eval_tasks_names, + options=benchmarks_names, help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.", ) with st.expander("View Eval Task"): - st.json(eval_tasks[selected_eval_task], expanded=True) + st.json(benchmarks[selected_benchmark], expanded=True) - st.session_state["selected_eval_task"] = selected_eval_task - st.session_state["eval_tasks"] = eval_tasks + st.session_state["selected_benchmark"] = selected_benchmark + st.session_state["benchmarks"] = benchmarks if st.button("Confirm", key="confirm_1"): - st.session_state["selected_eval_task_1_next"] = True + st.session_state["selected_benchmark_1_next"] = True def define_eval_candidate_2(): - if not st.session_state.get("selected_eval_task_1_next", None): + if not st.session_state.get("selected_benchmark_1_next", None): return st.subheader("2. Define Eval Candidate") @@ -163,11 +163,11 @@ def run_evaluation_3(): Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button. """ ) - selected_eval_task = st.session_state["selected_eval_task"] - eval_tasks = st.session_state["eval_tasks"] + selected_benchmark = st.session_state["selected_benchmark"] + benchmarks = st.session_state["benchmarks"] eval_candidate = st.session_state["eval_candidate"] - dataset_id = eval_tasks[selected_eval_task].dataset_id + dataset_id = benchmarks[selected_benchmark].dataset_id rows = llama_stack_api.client.datasetio.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1, @@ -182,16 +182,16 @@ def run_evaluation_3(): help="Number of examples from the dataset to evaluate. 
", ) - eval_task_config = { + benchmark_config = { "type": "benchmark", "eval_candidate": eval_candidate, "scoring_params": {}, } with st.expander("View Evaluation Task", expanded=True): - st.json(eval_tasks[selected_eval_task], expanded=True) + st.json(benchmarks[selected_benchmark], expanded=True) with st.expander("View Evaluation Task Configuration", expanded=True): - st.json(eval_task_config, expanded=True) + st.json(benchmark_config, expanded=True) # Add run button and handle evaluation if st.button("Run Evaluation"): @@ -211,10 +211,10 @@ def run_evaluation_3(): progress_bar.progress(progress, text=progress_text) # Run evaluation for current row eval_res = llama_stack_api.client.eval.evaluate_rows( - task_id=selected_eval_task, + task_id=selected_benchmark, input_rows=[r], - scoring_functions=eval_tasks[selected_eval_task].scoring_functions, - task_config=eval_task_config, + scoring_functions=benchmarks[selected_benchmark].scoring_functions, + task_config=benchmark_config, ) for k in r.keys(): @@ -227,7 +227,7 @@ def run_evaluation_3(): output_res[k] = [] output_res[k].append(eval_res.generations[0][k]) - for scoring_fn in eval_tasks[selected_eval_task].scoring_functions: + for scoring_fn in benchmarks[selected_benchmark].scoring_functions: if scoring_fn not in output_res: output_res[scoring_fn] = [] output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) @@ -247,7 +247,7 @@ def native_evaluation_page(): st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙") st.title("📊 Evaluations (Generation + Scoring)") - select_eval_task_1() + select_benchmark_1() define_eval_candidate_2() run_evaluation_3() diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index d0c448f8c6..494a46b036 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -10,10 +10,11 @@ from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field +from llama_stack.apis.benchmarks import Benchmark + from llama_stack.apis.datasets import Dataset from llama_stack.apis.datatypes import Api -from llama_stack.apis.eval_tasks import EvalTask from llama_stack.apis.models import Model from llama_stack.apis.scoring_functions import ScoringFn from llama_stack.apis.shields import Shield @@ -49,8 +50,8 @@ async def list_scoring_functions(self) -> List[ScoringFn]: ... async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ... -class EvalTasksProtocolPrivate(Protocol): - async def register_eval_task(self, eval_task: EvalTask) -> None: ... +class BenchmarksProtocolPrivate(Protocol): + async def register_benchmark(self, benchmark: Benchmark) -> None: ... 
class ToolsProtocolPrivate(Protocol): diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 1db627007a..07310f59c0 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -8,13 +8,13 @@ from tqdm import tqdm from llama_stack.apis.agents import Agents, StepType +from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets -from llama_stack.apis.eval_tasks import EvalTask from llama_stack.apis.inference import Inference, UserMessage from llama_stack.apis.scoring import Scoring from llama_stack.distribution.datatypes import Api -from llama_stack.providers.datatypes import EvalTasksProtocolPrivate +from llama_stack.providers.datatypes import BenchmarksProtocolPrivate from llama_stack.providers.inline.agents.meta_reference.agent_instance import ( MEMORY_QUERY_TOOL, @@ -27,16 +27,16 @@ from llama_stack.providers.utils.kvstore import kvstore_impl from .....apis.common.job_types import Job -from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus +from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus from .config import MetaReferenceEvalConfig -EVAL_TASKS_PREFIX = "eval_tasks:" +EVAL_TASKS_PREFIX = "benchmarks:" class MetaReferenceEvalImpl( Eval, - EvalTasksProtocolPrivate, + BenchmarksProtocolPrivate, ): def __init__( self, @@ -57,36 +57,36 @@ def __init__( # TODO: assume sync job, will need jobs API for async scheduling self.jobs = {} - self.eval_tasks = {} + self.benchmarks = {} async def initialize(self) -> None: self.kvstore = await kvstore_impl(self.config.kvstore) - # Load existing eval_tasks from kvstore + # Load existing benchmarks from kvstore start_key = EVAL_TASKS_PREFIX end_key = f"{EVAL_TASKS_PREFIX}\xff" - stored_eval_tasks = await self.kvstore.range(start_key, end_key) + stored_benchmarks = await self.kvstore.range(start_key, end_key) - for eval_task in stored_eval_tasks: - eval_task = EvalTask.model_validate_json(eval_task) - self.eval_tasks[eval_task.identifier] = eval_task + for benchmark in stored_benchmarks: + benchmark = Benchmark.model_validate_json(benchmark) + self.benchmarks[benchmark.identifier] = benchmark async def shutdown(self) -> None: ... 
- async def register_eval_task(self, task_def: EvalTask) -> None: + async def register_benchmark(self, task_def: Benchmark) -> None: # Store in kvstore key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}" await self.kvstore.set( key=key, value=task_def.model_dump_json(), ) - self.eval_tasks[task_def.identifier] = task_def + self.benchmarks[task_def.identifier] = task_def async def run_eval( self, task_id: str, - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> Job: - task_def = self.eval_tasks[task_id] + task_def = self.benchmarks[task_id] dataset_id = task_def.dataset_id candidate = task_config.eval_candidate scoring_functions = task_def.scoring_functions @@ -110,7 +110,7 @@ async def run_eval( return Job(job_id=job_id) async def _run_agent_generation( - self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig + self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig ) -> List[Dict[str, Any]]: candidate = task_config.eval_candidate create_response = await self.agents_api.create_agent(candidate.config) @@ -153,7 +153,7 @@ async def _run_agent_generation( return generations async def _run_model_generation( - self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig + self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig ) -> List[Dict[str, Any]]: candidate = task_config.eval_candidate assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided" @@ -192,7 +192,7 @@ async def evaluate_rows( task_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: candidate = task_config.eval_candidate if candidate.type == "agent": diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py index 40835bf53d..78351a28ef 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -11,8 +11,8 @@ from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType from llama_stack.apis.eval.eval import ( - AppEvalTaskConfig, - BenchmarkEvalTaskConfig, + AppBenchmarkConfig, + BenchmarkBenchmarkConfig, ModelCandidate, ) from llama_stack.apis.inference import SamplingParams @@ -30,18 +30,18 @@ class Testeval: @pytest.mark.asyncio - async def test_eval_tasks_list(self, eval_stack): + async def test_benchmarks_list(self, eval_stack): # NOTE: this needs you to ensure that you are starting from a clean state # but so far we don't have an unregister API unfortunately, so be careful - eval_tasks_impl = eval_stack[Api.eval_tasks] - response = await eval_tasks_impl.list_eval_tasks() + benchmarks_impl = eval_stack[Api.benchmarks] + response = await benchmarks_impl.list_benchmarks() assert isinstance(response, list) @pytest.mark.asyncio async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model): - eval_impl, eval_tasks_impl, datasetio_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasetio], eval_stack[Api.datasets], eval_stack[Api.models], @@ -60,8 +60,8 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model "basic::equality", ] task_id = "meta-reference::app_eval" - await eval_tasks_impl.register_eval_task( - eval_task_id=task_id, + await benchmarks_impl.register_benchmark( + 
benchmark_id=task_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) @@ -69,7 +69,7 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model task_id=task_id, input_rows=rows.rows, scoring_functions=scoring_functions, - task_config=AppEvalTaskConfig( + task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -92,9 +92,9 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model @pytest.mark.asyncio async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): - eval_impl, eval_tasks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasets], eval_stack[Api.models], ) @@ -106,14 +106,14 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): ] task_id = "meta-reference::app_eval-2" - await eval_tasks_impl.register_eval_task( - eval_task_id=task_id, + await benchmarks_impl.register_benchmark( + benchmark_id=task_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.run_eval( task_id=task_id, - task_config=AppEvalTaskConfig( + task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -131,9 +131,9 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): @pytest.mark.asyncio async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): - eval_impl, eval_tasks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasets], eval_stack[Api.models], ) @@ -159,20 +159,20 @@ async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): ) # register eval task - await eval_tasks_impl.register_eval_task( - eval_task_id="meta-reference-mmlu", + await benchmarks_impl.register_benchmark( + benchmark_id="meta-reference-mmlu", dataset_id="mmlu", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) # list benchmarks - response = await eval_tasks_impl.list_eval_tasks() + response = await benchmarks_impl.list_benchmarks() assert len(response) > 0 benchmark_id = "meta-reference-mmlu" response = await eval_impl.run_eval( task_id=benchmark_id, - task_config=BenchmarkEvalTaskConfig( + task_config=BenchmarkBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), diff --git a/llama_stack/providers/tests/resolver.py b/llama_stack/providers/tests/resolver.py index 0ff6327170..092514079a 100644 --- a/llama_stack/providers/tests/resolver.py +++ b/llama_stack/providers/tests/resolver.py @@ -10,8 +10,9 @@ from pydantic import BaseModel +from llama_stack.apis.benchmarks import BenchmarkInput + from llama_stack.apis.datasets import DatasetInput -from llama_stack.apis.eval_tasks import EvalTaskInput from llama_stack.apis.models import ModelInput from llama_stack.apis.scoring_functions import ScoringFnInput from llama_stack.apis.shields import ShieldInput @@ -42,7 +43,7 @@ async def construct_stack_for_test( vector_dbs: Optional[List[VectorDBInput]] = None, datasets: Optional[List[DatasetInput]] = None, scoring_fns: Optional[List[ScoringFnInput]] = None, - eval_tasks: Optional[List[EvalTaskInput]] = None, + benchmarks: 
Optional[List[BenchmarkInput]] = None, tool_groups: Optional[List[ToolGroupInput]] = None, ) -> TestStack: sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db") @@ -56,7 +57,7 @@ async def construct_stack_for_test( vector_dbs=vector_dbs or [], datasets=datasets or [], scoring_fns=scoring_fns or [], - eval_tasks=eval_tasks or [], + benchmarks=benchmarks or [], tool_groups=tool_groups or [], ) run_config = parse_and_maybe_upgrade_config(run_config) diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 39408c1bd7..81d8997163 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 5a70890a89..71003d5b0c 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index bdc82d03a9..493beeb0d7 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -108,7 +108,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: brave-search diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 2ba62a7821..cb045c714b 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -99,7 +99,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: brave-search diff --git a/llama_stack/templates/experimental-post-training/run.yaml b/llama_stack/templates/experimental-post-training/run.yaml index 75d103c9fa..e70ccdd2de 100644 --- a/llama_stack/templates/experimental-post-training/run.yaml +++ b/llama_stack/templates/experimental-post-training/run.yaml @@ -85,4 +85,4 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index a4b425436f..3bad366d1c 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -164,7 +164,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index a497317bde..e60fdecb2f 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -153,7 +153,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index 0329f580ba..0dfa9fade3 100644 --- 
a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -116,7 +116,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index 8163fe28e6..fdb19b63f9 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -106,7 +106,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 9cee920a5b..541d7c8645 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -116,7 +116,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index c8ad0d38da..301c3c1124 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -106,7 +106,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index 0faaabb159..7eb704e3f9 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 6ffe1fa360..92bdbabad0 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index 5ff87a9010..9fb506cbb2 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 6dc325e9dd..6702bf6ea5 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -139,7 +139,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 
5b5c9c253a..bc5fe4ce9b 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -113,7 +113,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 3cc1cb2ac6..eff648f039 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -102,7 +102,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 4a0fa9a857..97a4701aa2 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 9631f94a2f..1456f30880 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index 6cec51824c..9d29ff0c9b 100644 --- a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index 503505c326..322bc455ea 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -106,7 +106,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index f1953c513e..3cff45e4e1 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -105,7 +105,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index ec351108e5..39daf64810 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -159,7 +159,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index c2afd98e9b..effd7b9cd9 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -148,7 +148,7 @@ shields: vector_dbs: [] datasets: 
[] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index 165e4d51db..0cc03c7eef 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search From b20742fce742cf19b8293e12bff541adbb03047d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:31:42 -0800 Subject: [PATCH 08/31] replace --- llama_stack/distribution/ui/page/distribution/eval_tasks.py | 4 ++-- llama_stack/distribution/ui/page/distribution/resources.py | 4 ++-- llama_stack/distribution/ui/page/evaluations/native_eval.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llama_stack/distribution/ui/page/distribution/eval_tasks.py b/llama_stack/distribution/ui/page/distribution/eval_tasks.py index b83023901d..1428ae9ab2 100644 --- a/llama_stack/distribution/ui/page/distribution/eval_tasks.py +++ b/llama_stack/distribution/ui/page/distribution/eval_tasks.py @@ -9,8 +9,8 @@ def benchmarks(): - # Eval Tasks Section - st.header("Eval Tasks") + # Benchmarks Section + st.header("Benchmarks") benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()} diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py index a86fda8565..c0d4aac68e 100644 --- a/llama_stack/distribution/ui/page/distribution/resources.py +++ b/llama_stack/distribution/ui/page/distribution/resources.py @@ -21,7 +21,7 @@ def resources_page(): "Shields", "Scoring Functions", "Datasets", - "Eval Tasks", + "Benchmarks", ] icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"] selected_resource = option_menu( @@ -35,7 +35,7 @@ def resources_page(): }, }, ) - if selected_resource == "Eval Tasks": + if selected_resource == "Benchmarks": benchmarks() elif selected_resource == "Vector Databases": vector_dbs() diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index e24da4eb38..39385dd140 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -14,7 +14,7 @@ def select_benchmark_1(): - # Select Eval Tasks + # Select Benchmarks st.subheader("1. 
Choose An Eval Task") benchmarks = llama_stack_api.client.benchmarks.list() benchmarks = {et.identifier: et for et in benchmarks} From 017d24fe6561005b2debdddb5935e4c475629862 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:34:32 -0800 Subject: [PATCH 09/31] replace task_id -> benchmark_id --- docs/_static/llama-stack-spec.html | 26 +++--- docs/_static/llama-stack-spec.yaml | 26 +++--- .../Llama_Stack_Benchmark_Evals.ipynb | 6 +- docs/source/building_applications/evals.md | 6 +- .../building_applications/evaluation.md | 4 +- .../references/evals_reference/index.md | 6 +- .../references/python_sdk_reference/index.md | 10 +-- llama_stack/apis/eval/eval.py | 20 ++--- llama_stack/apis/eval_tasks/__init__.py | 7 -- llama_stack/apis/eval_tasks/eval_tasks.py | 86 ------------------- llama_stack/cli/download.py | 14 +-- llama_stack/cli/verify_download.py | 4 +- llama_stack/distribution/routers/routers.py | 28 +++--- .../distribution/routers/routing_tables.py | 10 +-- .../ui/page/evaluations/native_eval.py | 2 +- .../inline/eval/meta_reference/eval.py | 16 ++-- 16 files changed, 89 insertions(+), 182 deletions(-) delete mode 100644 llama_stack/apis/eval_tasks/__init__.py delete mode 100644 llama_stack/apis/eval_tasks/eval_tasks.py diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 84c6fd99df..c656808a67 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -613,7 +613,7 @@ } } }, - "/v1/eval/tasks/{task_id}/evaluations": { + "/v1/eval/tasks/{benchmark_id}/evaluations": { "post": { "responses": { "200": { @@ -633,7 +633,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -811,7 +811,7 @@ ] } }, - "/v1/eval/tasks/{task_id}": { + "/v1/eval/tasks/{benchmark_id}": { "get": { "responses": { "200": { @@ -838,7 +838,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1431,7 +1431,7 @@ } } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1458,7 +1458,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1487,7 +1487,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1505,7 +1505,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1533,7 +1533,7 @@ } }, { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -2204,7 +2204,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs": { + "/v1/eval/tasks/{benchmark_id}/jobs": { "post": { "responses": { "200": { @@ -2224,7 +2224,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -7361,7 +7361,7 @@ "RegisterBenchmarkRequest": { "type": "object", "properties": { - "task_id": { + "benchmark_id": { "type": "string" }, "dataset_id": { @@ -7407,7 +7407,7 @@ }, "additionalProperties": false, "required": [ - "task_id", + "benchmark_id", "dataset_id", "scoring_functions" ] diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index dd0951fdec..0f0a613a81 100644 --- a/docs/_static/llama-stack-spec.yaml +++ 
b/docs/_static/llama-stack-spec.yaml @@ -372,7 +372,7 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/tasks/{task_id}/evaluations: + /v1/eval/tasks/{benchmark_id}/evaluations: post: responses: '200': @@ -385,7 +385,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -490,7 +490,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}: + /v1/eval/tasks/{benchmark_id}: get: responses: '200': @@ -505,7 +505,7 @@ paths: - Benchmarks description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -852,7 +852,7 @@ paths: schema: $ref: '#/components/schemas/InvokeToolRequest' required: true - /v1/eval/tasks/{task_id}/jobs/{job_id}: + /v1/eval/tasks/{benchmark_id}/jobs/{job_id}: get: responses: '200': @@ -867,7 +867,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -885,7 +885,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -895,7 +895,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result: get: responses: '200': @@ -913,7 +913,7 @@ paths: required: true schema: type: string - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -1328,7 +1328,7 @@ paths: type: array items: type: string - /v1/eval/tasks/{task_id}/jobs: + /v1/eval/tasks/{benchmark_id}/jobs: post: responses: '200': @@ -1341,7 +1341,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -4678,7 +4678,7 @@ components: RegisterBenchmarkRequest: type: object properties: - task_id: + benchmark_id: type: string dataset_id: type: string @@ -4702,7 +4702,7 @@ components: - type: object additionalProperties: false required: - - task_id + - benchmark_id - dataset_id - scoring_functions RegisterModelRequest: diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 6e8480f945..599df201a0 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1024,7 +1024,7 @@ ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::mmmu\",\n", + " benchmark_id=\"meta-reference::mmmu\",\n", " input_rows=eval_rows,\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", " task_config={\n", @@ -1203,7 +1203,7 @@ ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::simpleqa\",\n", + " benchmark_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", @@ -1352,7 +1352,7 @@ "}\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::simpleqa\",\n", + " benchmark_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md index c1c371ca80..f28e0d5fd7 100644 --- a/docs/source/building_applications/evals.md +++ b/docs/source/building_applications/evals.md @@ -48,7 +48,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - 
task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -106,7 +106,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -156,7 +156,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ diff --git a/docs/source/building_applications/evaluation.md b/docs/source/building_applications/evaluation.md index df18c146cc..ad220f7518 100644 --- a/docs/source/building_applications/evaluation.md +++ b/docs/source/building_applications/evaluation.md @@ -18,7 +18,7 @@ response = client.benchmarks.register( # Run evaluation job = client.eval.run_eval( - task_id="my_eval", + benchmark_id="my_eval", task_config={ "type": "app", "eval_candidate": {"type": "agent", "config": agent_config}, @@ -26,5 +26,5 @@ job = client.eval.run_eval( ) # Get results -result = client.eval.job_result(task_id="my_eval", job_id=job.job_id) +result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id) ``` diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md index f0275511df..71dbb47e59 100644 --- a/docs/source/references/evals_reference/index.md +++ b/docs/source/references/evals_reference/index.md @@ -84,7 +84,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -142,7 +142,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -192,7 +192,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md index eca8c58f54..9d1130422f 100644 --- a/docs/source/references/python_sdk_reference/index.md +++ b/docs/source/references/python_sdk_reference/index.md @@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job Methods: -- client.eval.evaluate_rows(task_id, \*\*params) -> EvaluateResponse -- client.eval.run_eval(task_id, \*\*params) -> Job +- client.eval.evaluate_rows(benchmark_id, \*\*params) -> EvaluateResponse +- client.eval.run_eval(benchmark_id, \*\*params) -> Job ### Jobs @@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse Methods: -- client.eval.jobs.retrieve(job_id, \*, task_id) -> EvaluateResponse -- client.eval.jobs.cancel(job_id, \*, task_id) -> None -- client.eval.jobs.status(job_id, \*, task_id) -> Optional[JobStatusResponse] +- client.eval.jobs.retrieve(job_id, \*, benchmark_id) -> EvaluateResponse +- client.eval.jobs.cancel(job_id, \*, benchmark_id) -> None +- client.eval.jobs.status(job_id, \*, benchmark_id) -> 
Optional[JobStatusResponse] ## Inspect diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 16b96d618b..273ef657c1 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -76,27 +76,27 @@ class EvaluateResponse(BaseModel): class Eval(Protocol): - @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + @webmethod(route="/eval/tasks/{benchmark_id}/jobs", method="POST") async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: BenchmarkConfig, ) -> Job: ... - @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") + @webmethod(route="/eval/tasks/{benchmark_id}/evaluations", method="POST") async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, ) -> EvaluateResponse: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="GET") + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") - async def job_cancel(self, task_id: str, job_id: str) -> None: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="DELETE") + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}/result", method="GET") + async def job_result(self, job_id: str, benchmark_id: str) -> EvaluateResponse: ... diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/eval_tasks/__init__.py deleted file mode 100644 index f8f5649570..0000000000 --- a/llama_stack/apis/eval_tasks/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .benchmarks import * # noqa: F401 F403 diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py deleted file mode 100644 index 7c8ed8dc04..0000000000 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable - -from llama_models.schema_utils import json_schema_type, webmethod -from pydantic import BaseModel, Field - -from llama_stack.apis.resource import Resource, ResourceType - - -class CommonBenchmarkFields(BaseModel): - dataset_id: str - scoring_functions: List[str] - metadata: Dict[str, Any] = Field( - default_factory=dict, - description="Metadata for this evaluation task", - ) - - -@json_schema_type -class Benchmark(CommonBenchmarkFields, Resource): - type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value - - @property - def task_id(self) -> str: - return self.identifier - - @property - def provider_benchmark_id(self) -> str: - return self.provider_resource_id - - -class BenchmarkInput(CommonBenchmarkFields, BaseModel): - task_id: str - provider_id: Optional[str] = None - provider_benchmark_id: Optional[str] = None - - -class ListBenchmarksResponse(BaseModel): - data: List[Benchmark] - - -@runtime_checkable -class Benchmarks(Protocol): - @webmethod(route="/eval/tasks", method="GET") - async def list_benchmarks(self) -> ListBenchmarksResponse: ... - - @webmethod(route="/eval/tasks/{task_id}", method="GET") - async def get_benchmark( - self, - task_id: str, - ) -> Optional[Benchmark]: ... - - @webmethod(route="/eval/tasks", method="POST") - async def register_benchmark( - self, - task_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_benchmark_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... - - @webmethod(route="/eval-tasks", method="GET") - async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: ... - - @webmethod(route="/eval-tasks/{benchmark_id}", method="GET") - async def DEPRECATED_get_benchmark( - self, - benchmark_id: str, - ) -> Optional[Benchmark]: ... - - @webmethod(route="/eval-tasks", method="POST") - async def DEPRECATED_register_benchmark( - self, - benchmark_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_benchmark_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... 
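For orientation, a sketch of the Eval job flow from the client after this patch's `task_id` -> `benchmark_id` rename, not part of the patch. It assumes the benchmark registered in the earlier sketch and a served inference model; the `run_eval` and `jobs.retrieve` signatures follow the SDK reference updated above, and fetching the result immediately relies on the meta-reference provider running jobs synchronously (see the TODO in its eval.py earlier in this series):

```python
# Sketch only: POST /v1/eval/tasks/{benchmark_id}/jobs, then fetch the result.
# Model id and max_tokens are illustrative; the meta-reference provider asserts
# that sampling_params.max_tokens is set for model candidates.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")

job = client.eval.run_eval(
    benchmark_id="meta-reference-mmlu",
    task_config={
        "type": "benchmark",
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "sampling_params": {"max_tokens": 4096},
        },
    },
)

# GET /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result
result = client.eval.jobs.retrieve(job_id=job.job_id, benchmark_id="meta-reference-mmlu")
print(result.scores)
```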
diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index 379ac49caa..7b9b303f48 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -105,7 +105,7 @@ class DownloadTask: output_file: str total_size: int = 0 downloaded_size: int = 0 - task_id: Optional[int] = None + benchmark_id: Optional[int] = None retries: int = 0 max_retries: int = 3 @@ -183,8 +183,8 @@ async def _get_info(): ) # Update the progress bar's total size once we know it - if task.task_id is not None: - self.progress.update(task.task_id, total=task.total_size) + if task.benchmark_id is not None: + self.progress.update(task.benchmark_id, total=task.total_size) except httpx.HTTPError as e: self.console.print(f"[red]Error getting file info: {str(e)}[/red]") @@ -207,7 +207,7 @@ async def _download_chunk(): file.write(chunk) task.downloaded_size += len(chunk) self.progress.update( - task.task_id, + task.benchmark_id, completed=task.downloaded_size, ) @@ -234,7 +234,7 @@ async def download_file(self, task: DownloadTask) -> None: if os.path.exists(task.output_file): if self.verify_file_integrity(task): self.console.print(f"[green]Already downloaded {task.output_file}[/green]") - self.progress.update(task.task_id, completed=task.total_size) + self.progress.update(task.benchmark_id, completed=task.total_size) return await self.prepare_download(task) @@ -258,7 +258,7 @@ async def download_file(self, task: DownloadTask) -> None: raise DownloadError(f"Download failed: {str(e)}") from e except Exception as e: - self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]") + self.progress.update(task.benchmark_id, description=f"[red]Failed: {task.output_file}[/red]") raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e def has_disk_space(self, tasks: List[DownloadTask]) -> bool: @@ -293,7 +293,7 @@ async def download_all(self, tasks: List[DownloadTask]) -> None: with self.progress: for task in tasks: desc = f"Downloading {Path(task.output_file).name}" - task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) + task.benchmark_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) semaphore = asyncio.Semaphore(self.max_concurrent_downloads) diff --git a/llama_stack/cli/verify_download.py b/llama_stack/cli/verify_download.py index 47993c3613..ca72ca5818 100644 --- a/llama_stack/cli/verify_download.py +++ b/llama_stack/cli/verify_download.py @@ -82,7 +82,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) as progress: for filepath, expected_hash in checksums.items(): full_path = model_dir / filepath - task_id = progress.add_task(f"Verifying {filepath}...", total=None) + benchmark_id = progress.add_task(f"Verifying {filepath}...", total=None) exists = full_path.exists() actual_hash = None @@ -102,7 +102,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) ) - progress.remove_task(task_id) + progress.remove_task(benchmark_id) return results diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 6697b03e26..f9f3067670 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -347,23 +347,23 @@ async def shutdown(self) -> None: async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: AppBenchmarkConfig, ) -> Job: - return await self.routing_table.get_provider_impl(task_id).run_eval( 
- task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).run_eval( + benchmark_id=benchmark_id, task_config=task_config, ) async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).evaluate_rows( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows( + benchmark_id=benchmark_id, input_rows=input_rows, scoring_functions=scoring_functions, task_config=task_config, @@ -371,28 +371,28 @@ async def evaluate_rows( async def job_status( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> Optional[JobStatus]: - return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id) + return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id) async def job_cancel( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> None: - await self.routing_table.get_provider_impl(task_id).job_cancel( - task_id, + await self.routing_table.get_provider_impl(benchmark_id).job_cancel( + benchmark_id, job_id, ) async def job_result( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).job_result( - task_id, + return await self.routing_table.get_provider_impl(benchmark_id).job_result( + benchmark_id, job_id, ) diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 6c1b06ed6b..a52ab7fbdf 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -433,12 +433,12 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): async def list_benchmarks(self) -> ListBenchmarksResponse: return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark")) - async def get_benchmark(self, task_id: str) -> Optional[Benchmark]: - return await self.get_object_by_identifier("benchmark", task_id) + async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]: + return await self.get_object_by_identifier("benchmark", benchmark_id) async def register_benchmark( self, - task_id: str, + benchmark_id: str, dataset_id: str, scoring_functions: List[str], metadata: Optional[Dict[str, Any]] = None, @@ -455,9 +455,9 @@ async def register_benchmark( "No provider specified and multiple providers available. Please specify a provider_id." 
) if provider_benchmark_id is None: - provider_benchmark_id = task_id + provider_benchmark_id = benchmark_id benchmark = Benchmark( - identifier=task_id, + identifier=benchmark_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index 39385dd140..753c574a28 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -211,7 +211,7 @@ def run_evaluation_3(): progress_bar.progress(progress, text=progress_text) # Run evaluation for current row eval_res = llama_stack_api.client.eval.evaluate_rows( - task_id=selected_benchmark, + benchmark_id=selected_benchmark, input_rows=[r], scoring_functions=benchmarks[selected_benchmark].scoring_functions, task_config=benchmark_config, diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 07310f59c0..a02418e741 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -83,10 +83,10 @@ async def register_benchmark(self, task_def: Benchmark) -> None: async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: BenchmarkConfig, ) -> Job: - task_def = self.benchmarks[task_id] + task_def = self.benchmarks[benchmark_id] dataset_id = task_def.dataset_id candidate = task_config.eval_candidate scoring_functions = task_def.scoring_functions @@ -97,7 +97,7 @@ async def run_eval( rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples), ) res = await self.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=all_rows.rows, scoring_functions=scoring_functions, task_config=task_config, @@ -189,7 +189,7 @@ async def _run_model_generation( async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, @@ -219,17 +219,17 @@ async def evaluate_rows( return EvaluateResponse(generations=generations, scores=score_response.results) - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: if job_id in self.jobs: return JobStatus.completed return None - async def job_cancel(self, task_id: str, job_id: str) -> None: + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: raise NotImplementedError("Job cancel is not implemented yet") - async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse: - status = await self.job_status(task_id, job_id) + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: + status = await self.job_status(benchmark_id, job_id) if not status or status != JobStatus.completed: raise ValueError(f"Job is not completed, Status: {status.value}") From 8759196e29ccd3650da830e18c182e1ac1f2b37d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:38:21 -0800 Subject: [PATCH 10/31] benchmark config --- llama_stack/apis/eval/eval.py | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 273ef657c1..90b14131f4 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -38,18 +38,7 @@ class 
AgentCandidate(BaseModel): @json_schema_type -class BenchmarkBenchmarkConfig(BaseModel): - type: Literal["benchmark"] = "benchmark" - eval_candidate: EvalCandidate - num_examples: Optional[int] = Field( - description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated", - default=None, - ) - - -@json_schema_type -class AppBenchmarkConfig(BaseModel): - type: Literal["app"] = "app" +class BenchmarkConfig(BaseModel): eval_candidate: EvalCandidate scoring_params: Dict[str, ScoringFnParams] = Field( description="Map between scoring function id and parameters for each scoring function you want to run", @@ -62,12 +51,6 @@ class AppBenchmarkConfig(BaseModel): # we could optinally add any specific dataset config here -BenchmarkConfig = register_schema( - Annotated[Union[BenchmarkBenchmarkConfig, AppBenchmarkConfig], Field(discriminator="type")], - name="BenchmarkConfig", -) - - @json_schema_type class EvaluateResponse(BaseModel): generations: List[Dict[str, Any]] @@ -76,14 +59,14 @@ class EvaluateResponse(BaseModel): class Eval(Protocol): - @webmethod(route="/eval/tasks/{benchmark_id}/jobs", method="POST") + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST") async def run_eval( self, benchmark_id: str, task_config: BenchmarkConfig, ) -> Job: ... - @webmethod(route="/eval/tasks/{benchmark_id}/evaluations", method="POST") + @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") async def evaluate_rows( self, benchmark_id: str, @@ -92,11 +75,11 @@ async def evaluate_rows( task_config: BenchmarkConfig, ) -> EvaluateResponse: ... - @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="GET") + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... - @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="DELETE") + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE") async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... - @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, job_id: str, benchmark_id: str) -> EvaluateResponse: ... + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... From e07776fff618b4db2f12ff008b4f3cd51feba8c2 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:42:01 -0800 Subject: [PATCH 11/31] update --- .../distribution/routers/routing_tables.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index a52ab7fbdf..5d2da73372 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -4,6 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import logging
 from typing import Any, Dict, List, Optional
 
 from pydantic import TypeAdapter
@@ -39,6 +40,8 @@
 from llama_stack.distribution.store import DistributionRegistry
 from llama_stack.providers.datatypes import Api, RoutingTable
 
+logger = logging.getLogger(__name__)
+
 
 def get_impl_api(p: Any) -> Api:
     return p.__provider_spec__.api
@@ -466,16 +469,18 @@ async def register_benchmark(
         )
         await self.register_object(benchmark)
 
-    async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse:
+    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
+        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
         raise DeprecationWarning("Use /eval/tasks instead")
 
-    async def DEPRECATED_get_benchmark(
+    async def DEPRECATED_get_eval_task(
         self,
         benchmark_id: str,
     ) -> Optional[Benchmark]:
+        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
         raise DeprecationWarning("Use /eval/tasks instead")
 
-    async def DEPRECATED_register_benchmark(
+    async def DEPRECATED_register_eval_task(
         self,
         benchmark_id: str,
         dataset_id: str,
@@ -484,7 +489,14 @@ async def register_benchmark(
         provider_id: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
     ) -> None:
-        raise DeprecationWarning("Use /eval/tasks instead")
+        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
+        return await self.register_benchmark(
+            benchmark_id=benchmark_id,
+            dataset_id=dataset_id,
+            scoring_functions=scoring_functions,
+            metadata=metadata,
+            provider_benchmark_id=provider_benchmark_id,
+        )
 
 
 class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):

From ec721b3867d664a486faebb6a2a2b7a77ecd0b71 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Wed, 12 Feb 2025 20:48:05 -0800
Subject: [PATCH 12/31] update: move eval routes to /eval/benchmarks, keep
 deprecated /eval/tasks shims

---
 docs/_static/llama-stack-spec.html          | 469 ++++++++++++++------
 docs/_static/llama-stack-spec.yaml          | 310 +++++++++----
 llama_stack/apis/eval/eval.py               |  25 ++
 llama_stack/distribution/routers/routers.py |   3 +-
 4 files changed, 584 insertions(+), 223 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index c656808a67..652dae562c 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -67,8 +67,8 @@
             "description": "",
             "parameters": [
                 {
-                    "name": "benchmark_id",
-                    "in": "path",
+                    "name": "task_id",
+                    "in": "query",
                     "required": true,
                     "schema": {
                         "type": "string"
@@ -114,7 +114,7 @@
                 "content": {
                     "application/json": {
                         "schema": {
-                            "$ref": "#/components/schemas/DeprecatedRegisterBenchmarkRequest"
+                            "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest"
                         }
                     }
                 },
@@ -613,7 +613,7 @@
                 }
             }
         },
-        "/v1/eval/tasks/{benchmark_id}/evaluations": {
+        "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
             "post": {
                 "responses": {
                     "200": {
@@ -653,6 +653,47 @@
                 }
             }
         },
+        "/v1/eval/tasks/{task_id}/evaluations": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluateResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/EvaluateRowsDeprecatedRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                },
+                "deprecated": true
+            }
+        },
         "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": {
             "get": {
                 "responses": {
@@ -753,7 +794,7 @@
                 ]
             }
         },
-        "/v1/datasets/{dataset_id}": {
+        
"/v1/eval/benchmarks/{benchmark_id}": { "get": { "responses": { "200": { @@ -763,7 +804,7 @@ "schema": { "oneOf": [ { - "$ref": "#/components/schemas/Dataset" + "$ref": "#/components/schemas/Benchmark" }, { "type": "null" @@ -775,12 +816,12 @@ } }, "tags": [ - "Datasets" + "Benchmarks" ], "description": "", "parameters": [ { - "name": "dataset_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -788,11 +829,27 @@ } } ] - }, - "delete": { + } + }, + "/v1/datasets/{dataset_id}": { + "get": { "responses": { "200": { - "description": "OK" + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/Dataset" + }, + { + "type": "null" + } + ] + } + } + } } }, "tags": [ @@ -809,36 +866,20 @@ } } ] - } - }, - "/v1/eval/tasks/{benchmark_id}": { - "get": { + }, + "delete": { "responses": { "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Benchmark" - }, - { - "type": "null" - } - ] - } - } - } + "description": "OK" } }, "tags": [ - "Benchmarks" + "Datasets" ], "description": "", "parameters": [ { - "name": "benchmark_id", + "name": "dataset_id", "in": "path", "required": true, "schema": { @@ -1431,7 +1472,7 @@ } } }, - "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": { + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1505,7 +1546,7 @@ ] } }, - "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": { + "/v1/eval/tasks/{task_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1513,7 +1554,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateResponse" + "oneOf": [ + { + "$ref": "#/components/schemas/JobStatus" + }, + { + "type": "null" + } + ] } } } @@ -1524,6 +1572,14 @@ ], "description": "", "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, { "name": "job_id", "in": "path", @@ -1531,7 +1587,60 @@ "schema": { "type": "string" } + } + ], + "deprecated": true + }, + "delete": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ { "name": "benchmark_id", "in": "path", @@ -1539,11 +1648,19 @@ "schema": { "type": "string" } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } } ] } }, - "/v1/datasets": { + "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1551,14 +1668,53 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListDatasetsResponse" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "Datasets" + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + 
"type": "string" + } + } + ], + "deprecated": true + } + }, + "/v1/eval/benchmarks": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListBenchmarksResponse" + } + } + } + } + }, + "tags": [ + "Benchmarks" ], "description": "", "parameters": [] @@ -1570,7 +1726,7 @@ } }, "tags": [ - "Datasets" + "Benchmarks" ], "description": "", "parameters": [], @@ -1578,7 +1734,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RegisterDatasetRequest" + "$ref": "#/components/schemas/RegisterBenchmarkRequest" } } }, @@ -1586,7 +1742,7 @@ } } }, - "/v1/eval/tasks": { + "/v1/datasets": { "get": { "responses": { "200": { @@ -1594,14 +1750,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListBenchmarksResponse" + "$ref": "#/components/schemas/ListDatasetsResponse" } } } } }, "tags": [ - "Benchmarks" + "Datasets" ], "description": "", "parameters": [] @@ -1613,7 +1769,7 @@ } }, "tags": [ - "Benchmarks" + "Datasets" ], "description": "", "parameters": [], @@ -1621,7 +1777,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RegisterBenchmarkRequest" + "$ref": "#/components/schemas/RegisterDatasetRequest" } } }, @@ -2204,7 +2360,7 @@ ] } }, - "/v1/eval/tasks/{benchmark_id}/jobs": { + "/v1/eval/benchmarks/{benchmark_id}/jobs": { "post": { "responses": { "200": { @@ -2244,6 +2400,47 @@ } } }, + "/v1/eval/tasks/{task_id}/jobs": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Job" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RunEvalDeprecatedRequest" + } + } + }, + "required": true + }, + "deprecated": true + } + }, "/v1/safety/run-shield": { "post": { "responses": { @@ -2526,10 +2723,10 @@ "data" ] }, - "DeprecatedRegisterBenchmarkRequest": { + "DeprecatedRegisterEvalTaskRequest": { "type": "object", "properties": { - "benchmark_id": { + "task_id": { "type": "string" }, "dataset_id": { @@ -2575,7 +2772,7 @@ }, "additionalProperties": false, "required": [ - "benchmark_id", + "task_id", "dataset_id", "scoring_functions" ] @@ -4745,34 +4942,6 @@ "accuracy" ] }, - "AppBenchmarkConfig": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "app", - "default": "app" - }, - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate" - }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" - } - }, - "num_examples": { - "type": "integer" - } - }, - "additionalProperties": false, - "required": [ - "type", - "eval_candidate", - "scoring_params" - ] - }, "BasicScoringFnParams": { "type": "object", "properties": { @@ -4793,25 +4962,26 @@ "type" ] }, - "BenchmarkBenchmarkConfig": { + "BenchmarkConfig": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "benchmark", - "default": "benchmark" - }, "eval_candidate": { "$ref": "#/components/schemas/EvalCandidate" }, + "scoring_params": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringFnParams" + } + }, "num_examples": { "type": "integer" } }, "additionalProperties": 
false, "required": [ - "type", - "eval_candidate" + "eval_candidate", + "scoring_params" ] }, "EvalCandidate": { @@ -4831,23 +5001,6 @@ } } }, - "BenchmarkConfig": { - "oneOf": [ - { - "$ref": "#/components/schemas/BenchmarkBenchmarkConfig" - }, - { - "$ref": "#/components/schemas/AppBenchmarkConfig" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "benchmark": "#/components/schemas/BenchmarkBenchmarkConfig", - "app": "#/components/schemas/AppBenchmarkConfig" - } - } - }, "LLMAsJudgeScoringFnParams": { "type": "object", "properties": { @@ -5108,6 +5261,54 @@ "aggregated_results" ] }, + "EvaluateRowsDeprecatedRequest": { + "type": "object", + "properties": { + "input_rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "input_rows", + "scoring_functions", + "task_config" + ] + }, "Session": { "type": "object", "properties": { @@ -7304,22 +7505,22 @@ "data" ] }, - "RegisterDatasetRequest": { + "RegisterBenchmarkRequest": { "type": "object", "properties": { + "benchmark_id": { + "type": "string" + }, "dataset_id": { "type": "string" }, - "dataset_schema": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ParamType" + "scoring_functions": { + "type": "array", + "items": { + "type": "string" } }, - "url": { - "$ref": "#/components/schemas/URL" - }, - "provider_dataset_id": { + "provider_benchmark_id": { "type": "string" }, "provider_id": { @@ -7353,27 +7554,27 @@ }, "additionalProperties": false, "required": [ + "benchmark_id", "dataset_id", - "dataset_schema", - "url" + "scoring_functions" ] }, - "RegisterBenchmarkRequest": { + "RegisterDatasetRequest": { "type": "object", "properties": { - "benchmark_id": { - "type": "string" - }, "dataset_id": { "type": "string" }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" + "dataset_schema": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ParamType" } }, - "provider_benchmark_id": { + "url": { + "$ref": "#/components/schemas/URL" + }, + "provider_dataset_id": { "type": "string" }, "provider_id": { @@ -7407,9 +7608,9 @@ }, "additionalProperties": false, "required": [ - "benchmark_id", "dataset_id", - "scoring_functions" + "dataset_schema", + "url" ] }, "RegisterModelRequest": { @@ -7623,6 +7824,18 @@ "job_id" ] }, + "RunEvalDeprecatedRequest": { + "type": "object", + "properties": { + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "task_config" + ] + }, "RunShieldRequest": { "type": "object", "properties": { @@ -8105,6 +8318,9 @@ { "name": "BatchInference (Coming Soon)" }, + { + "name": "Benchmarks" + }, { "name": "DatasetIO" }, @@ -8114,9 +8330,6 @@ { "name": "Eval" }, - { - "name": "Benchmarks" - }, { "name": "Inference", "description": "This API provides the raw interface to the underlying models. 
Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", @@ -8168,10 +8381,10 @@ "tags": [ "Agents", "BatchInference (Coming Soon)", + "Benchmarks", "DatasetIO", "Datasets", "Eval", - "Benchmarks", "Inference", "Inspect", "Models", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 0f0a613a81..89e0669177 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -25,8 +25,8 @@ paths: - Benchmarks description: '' parameters: - - name: benchmark_id - in: path + - name: task_id + in: query required: true schema: type: string @@ -57,7 +57,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/DeprecatedRegisterBenchmarkRequest' + $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' required: true deprecated: true /v1/datasetio/rows: @@ -372,7 +372,7 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/tasks/{benchmark_id}/evaluations: + /v1/eval/benchmarks/{benchmark_id}/evaluations: post: responses: '200': @@ -396,6 +396,31 @@ paths: schema: $ref: '#/components/schemas/EvaluateRowsRequest' required: true + /v1/eval/tasks/{task_id}/evaluations: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateRowsDeprecatedRequest' + required: true + deprecated: true /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}: get: responses: @@ -457,7 +482,7 @@ paths: required: true schema: type: string - /v1/datasets/{dataset_id}: + /v1/eval/benchmarks/{benchmark_id}: get: responses: '200': @@ -466,21 +491,28 @@ paths: application/json: schema: oneOf: - - $ref: '#/components/schemas/Dataset' + - $ref: '#/components/schemas/Benchmark' - type: 'null' tags: - - Datasets + - Benchmarks description: '' parameters: - - name: dataset_id + - name: benchmark_id in: path required: true schema: type: string - delete: + /v1/datasets/{dataset_id}: + get: responses: '200': description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/Dataset' + - type: 'null' tags: - Datasets description: '' @@ -490,22 +522,15 @@ paths: required: true schema: type: string - /v1/eval/tasks/{benchmark_id}: - get: + delete: responses: '200': description: OK - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/Benchmark' - - type: 'null' tags: - - Benchmarks + - Datasets description: '' parameters: - - name: benchmark_id + - name: dataset_id in: path required: true schema: @@ -852,7 +877,7 @@ paths: schema: $ref: '#/components/schemas/InvokeToolRequest' required: true - /v1/eval/tasks/{benchmark_id}/jobs/{job_id}: + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}: get: responses: '200': @@ -895,7 +920,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result: + /v1/eval/tasks/{task_id}/jobs/{job_id}: get: responses: '200': @@ -903,22 +928,67 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluateResponse' + oneOf: + - $ref: '#/components/schemas/JobStatus' + - type: 'null' tags: - Eval description: '' 
parameters: + - name: task_id + in: path + required: true + schema: + type: string - name: job_id in: path required: true schema: type: string + deprecated: true + delete: + responses: + '200': + description: OK + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: - name: benchmark_id in: path required: true schema: type: string - /v1/datasets: + - name: job_id + in: path + required: true + schema: + type: string + /v1/eval/tasks/{task_id}/jobs/{job_id}/result: get: responses: '200': @@ -926,9 +996,33 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ListDatasetsResponse' + $ref: '#/components/schemas/EvaluateResponse' tags: - - Datasets + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval/benchmarks: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListBenchmarksResponse' + tags: + - Benchmarks description: '' parameters: [] post: @@ -936,16 +1030,16 @@ paths: '200': description: OK tags: - - Datasets + - Benchmarks description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/RegisterDatasetRequest' + $ref: '#/components/schemas/RegisterBenchmarkRequest' required: true - /v1/eval/tasks: + /v1/datasets: get: responses: '200': @@ -953,9 +1047,9 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ListBenchmarksResponse' + $ref: '#/components/schemas/ListDatasetsResponse' tags: - - Benchmarks + - Datasets description: '' parameters: [] post: @@ -963,14 +1057,14 @@ paths: '200': description: OK tags: - - Benchmarks + - Datasets description: '' parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/RegisterBenchmarkRequest' + $ref: '#/components/schemas/RegisterDatasetRequest' required: true /v1/models: get: @@ -1328,7 +1422,7 @@ paths: type: array items: type: string - /v1/eval/tasks/{benchmark_id}/jobs: + /v1/eval/benchmarks/{benchmark_id}/jobs: post: responses: '200': @@ -1352,6 +1446,31 @@ paths: schema: $ref: '#/components/schemas/RunEvalRequest' required: true + /v1/eval/tasks/{task_id}/jobs: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/Job' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RunEvalDeprecatedRequest' + required: true + deprecated: true /v1/safety/run-shield: post: responses: @@ -1527,10 +1646,10 @@ components: additionalProperties: false required: - data - DeprecatedRegisterBenchmarkRequest: + DeprecatedRegisterEvalTaskRequest: type: object properties: - benchmark_id: + task_id: type: string dataset_id: type: string @@ -1554,7 +1673,7 @@ components: - type: object additionalProperties: false required: - - benchmark_id + - task_id - dataset_id - scoring_functions AppendRowsRequest: @@ -3063,26 +3182,6 
@@ components: - median - categorical_count - accuracy - AppBenchmarkConfig: - type: object - properties: - type: - type: string - const: app - default: app - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - num_examples: - type: integer - additionalProperties: false - required: - - type - - eval_candidate - - scoring_params BasicScoringFnParams: type: object properties: @@ -3097,21 +3196,21 @@ components: additionalProperties: false required: - type - BenchmarkBenchmarkConfig: + BenchmarkConfig: type: object properties: - type: - type: string - const: benchmark - default: benchmark eval_candidate: $ref: '#/components/schemas/EvalCandidate' + scoring_params: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringFnParams' num_examples: type: integer additionalProperties: false required: - - type - eval_candidate + - scoring_params EvalCandidate: oneOf: - $ref: '#/components/schemas/ModelCandidate' @@ -3121,15 +3220,6 @@ components: mapping: model: '#/components/schemas/ModelCandidate' agent: '#/components/schemas/AgentCandidate' - BenchmarkConfig: - oneOf: - - $ref: '#/components/schemas/BenchmarkBenchmarkConfig' - - $ref: '#/components/schemas/AppBenchmarkConfig' - discriminator: - propertyName: type - mapping: - benchmark: '#/components/schemas/BenchmarkBenchmarkConfig' - app: '#/components/schemas/AppBenchmarkConfig' LLMAsJudgeScoringFnParams: type: object properties: @@ -3278,6 +3368,32 @@ components: required: - score_rows - aggregated_results + EvaluateRowsDeprecatedRequest: + type: object + properties: + input_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scoring_functions: + type: array + items: + type: string + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - input_rows + - scoring_functions + - task_config Session: type: object properties: @@ -4645,18 +4761,18 @@ components: additionalProperties: false required: - data - RegisterDatasetRequest: + RegisterBenchmarkRequest: type: object properties: + benchmark_id: + type: string dataset_id: type: string - dataset_schema: - type: object - additionalProperties: - $ref: '#/components/schemas/ParamType' - url: - $ref: '#/components/schemas/URL' - provider_dataset_id: + scoring_functions: + type: array + items: + type: string + provider_benchmark_id: type: string provider_id: type: string @@ -4672,21 +4788,21 @@ components: - type: object additionalProperties: false required: + - benchmark_id - dataset_id - - dataset_schema - - url - RegisterBenchmarkRequest: + - scoring_functions + RegisterDatasetRequest: type: object properties: - benchmark_id: - type: string dataset_id: type: string - scoring_functions: - type: array - items: - type: string - provider_benchmark_id: + dataset_schema: + type: object + additionalProperties: + $ref: '#/components/schemas/ParamType' + url: + $ref: '#/components/schemas/URL' + provider_dataset_id: type: string provider_id: type: string @@ -4702,9 +4818,9 @@ components: - type: object additionalProperties: false required: - - benchmark_id - dataset_id - - scoring_functions + - dataset_schema + - url RegisterModelRequest: type: object properties: @@ -4827,6 +4943,14 @@ components: additionalProperties: false required: - job_id + RunEvalDeprecatedRequest: + type: object + properties: 
+ task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - task_config RunShieldRequest: type: object properties: @@ -5125,10 +5249,10 @@ tags: x-displayName: >- Agents API for creating and interacting with agentic systems. - name: BatchInference (Coming Soon) + - name: Benchmarks - name: DatasetIO - name: Datasets - name: Eval - - name: Benchmarks - name: Inference description: >- This API provides the raw interface to the underlying models. Two kinds of models @@ -5159,10 +5283,10 @@ x-tagGroups: tags: - Agents - BatchInference (Coming Soon) + - Benchmarks - DatasetIO - Datasets - Eval - - Benchmarks - Inference - Inspect - Models diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 90b14131f4..b805e49762 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -83,3 +83,28 @@ async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + async def run_eval_DEPRECATED( + self, + task_id: str, + task_config: BenchmarkConfig, + ) -> Job: ... + + @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") + async def evaluate_rows_DEPRECATED( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") + async def job_status_DEPRECATED(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") + async def job_cancel_DEPRECATED(self, task_id: str, job_id: str) -> None: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") + async def job_result_DEPRECATED(self, task_id: str, job_id: str) -> EvaluateResponse: ... 
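
The five *_DEPRECATED methods above keep the old /eval/tasks routes alive while the canonical routes move to /eval/benchmarks. The protocol only declares them; a sketch of how an implementation could back one of them by pure forwarding is below. This is an assumption about the wiring (the next patch in the series touches the router and the meta-reference provider, presumably to do exactly this), and EvalShim is a made-up name; the logging pattern mirrors the routing table earlier in the series.

import logging
from typing import Any

logger = logging.getLogger(__name__)


class EvalShim:
    async def run_eval(self, benchmark_id: str, task_config: Any) -> Any:
        raise NotImplementedError  # real provider logic lives here

    async def run_eval_DEPRECATED(self, task_id: str, task_config: Any) -> Any:
        # The old `task_id` path parameter maps one-to-one onto the new
        # `benchmark_id`, so the shim just warns, renames, and forwards.
        logger.warning("DEPRECATED: use /eval/benchmarks/{benchmark_id}/jobs instead")
        return await self.run_eval(benchmark_id=task_id, task_config=task_config)

Because the forwarding is purely a rename, old clients keep working unchanged until the deprecated routes are finally dropped.
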
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index f9f3067670..9945ad367b 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -9,7 +9,6 @@ from llama_stack.apis.common.content_types import InterleavedContent, URL from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult from llama_stack.apis.eval import ( - AppBenchmarkConfig, BenchmarkConfig, Eval, EvaluateResponse, @@ -348,7 +347,7 @@ async def shutdown(self) -> None: async def run_eval( self, benchmark_id: str, - task_config: AppBenchmarkConfig, + task_config: BenchmarkConfig, ) -> Job: return await self.routing_table.get_provider_impl(benchmark_id).run_eval( benchmark_id=benchmark_id, From 2d0f6865ac1aaba44ada97caed89b4daaa79e1d4 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:05:39 -0800 Subject: [PATCH 13/31] fix --- docs/_static/llama-stack-spec.html | 3616 ++++++++--------- docs/_static/llama-stack-spec.yaml | 1476 +++---- .../Llama_Stack_Benchmark_Evals.ipynb | 4 +- llama_stack/apis/eval/eval.py | 10 +- llama_stack/distribution/routers/routers.py | 42 + .../distribution/routers/routing_tables.py | 12 +- .../inline/eval/meta_reference/eval.py | 42 + 7 files changed, 2643 insertions(+), 2559 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 652dae562c..381f37f1f0 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,6 +40,47 @@ } ], "paths": { + "/v1/eval/tasks/{task_id}/evaluations": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DeprecatedEvaluateRowsRequest" + } + } + }, + "required": true + }, + "deprecated": true + } + }, "/v1/eval-tasks/{benchmark_id}": { "get": { "responses": { @@ -78,6 +119,121 @@ "deprecated": true } }, + "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/JobStatus" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + }, + "delete": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, + "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + 
"in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, "/v1/eval-tasks": { "get": { "responses": { @@ -123,6 +279,47 @@ "deprecated": true } }, + "/v1/eval/tasks/{task_id}/jobs": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Job" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DeprecatedRunEvalRequest" + } + } + }, + "required": true + }, + "deprecated": true + } + }, "/v1/datasetio/rows": { "get": { "responses": { @@ -653,47 +850,6 @@ } } }, - "/v1/eval/tasks/{task_id}/evaluations": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateResponse" - } - } - } - } - }, - "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateRowsDeprecatedRequest" - } - } - }, - "required": true - }, - "deprecated": true - } - }, "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": { "get": { "responses": { @@ -1546,7 +1702,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1554,14 +1710,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/JobStatus" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/EvaluateResponse" } } } @@ -1573,76 +1722,7 @@ "description": "", "parameters": [ { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ], - "deprecated": true - }, - "delete": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ], - "deprecated": true - } - }, - "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateResponse" - } - } - } - } - }, - "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "benchmark_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1660,45 +1740,6 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateResponse" - } - } - } - } - }, - "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - 
"type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ], - "deprecated": true - } - }, "/v1/eval/benchmarks": { "get": { "responses": { @@ -2400,47 +2441,6 @@ } } }, - "/v1/eval/tasks/{task_id}/jobs": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Job" - } - } - } - } - }, - "tags": [ - "Eval" - ], - "description": "", - "parameters": [ - { - "name": "task_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RunEvalDeprecatedRequest" - } - } - }, - "required": true - }, - "deprecated": true - } - }, "/v1/safety/run-shield": { "post": { "responses": { @@ -2645,216 +2645,211 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { - "Benchmark": { + "AgentCandidate": { "type": "object", "properties": { - "identifier": { - "type": "string" - }, - "provider_resource_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, "type": { "type": "string", - "const": "benchmark", - "default": "benchmark" - }, - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } + "const": "agent", + "default": "agent" }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "config": { + "$ref": "#/components/schemas/AgentConfig" } }, "additionalProperties": false, "required": [ - "identifier", - "provider_resource_id", - "provider_id", "type", - "dataset_id", - "scoring_functions", - "metadata" + "config" ] }, - "ListBenchmarksResponse": { + "AgentConfig": { "type": "object", "properties": { - "data": { + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "input_shields": { "type": "array", "items": { - "$ref": "#/components/schemas/Benchmark" + "type": "string" } - } - }, - "additionalProperties": false, - "required": [ - "data" - ] - }, - "DeprecatedRegisterEvalTaskRequest": { - "type": "object", - "properties": { - "task_id": { - "type": "string" - }, - "dataset_id": { - "type": "string" }, - "scoring_functions": { + "output_shields": { "type": "array", "items": { "type": "string" } }, - "provider_benchmark_id": { - "type": "string" + "toolgroups": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AgentTool" + } }, - "provider_id": { - "type": "string" + "client_tools": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolDef" + } }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. 
It depends on the Instruction Following capabilities of the model.", + "default": "auto" + }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "Prompt format for calling custom / zero shot tools." + }, + "tool_config": { + "$ref": "#/components/schemas/ToolConfig" + }, + "max_infer_iters": { + "type": "integer", + "default": 10 + }, + "model": { + "type": "string" + }, + "instructions": { + "type": "string" + }, + "enable_session_persistence": { + "type": "boolean" + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat" } }, "additionalProperties": false, "required": [ - "task_id", - "dataset_id", - "scoring_functions" + "model", + "instructions", + "enable_session_persistence" ] }, - "AppendRowsRequest": { + "AgentTool": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "args": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "name", + "args" + ] + } + ] + }, + "AggregationFunctionType": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ] + }, + "BasicScoringFnParams": { "type": "object", "properties": { - "dataset_id": { - "type": "string" + "type": { + "type": "string", + "const": "basic", + "default": "basic" }, - "rows": { + "aggregation_functions": { "type": "array", "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "$ref": "#/components/schemas/AggregationFunctionType" } } }, "additionalProperties": false, "required": [ - "dataset_id", - "rows" + "type" ] }, - "CompletionMessage": { + "BenchmarkConfig": { "type": "object", "properties": { - "role": { - "type": "string", - "const": "assistant", - "default": "assistant", - "description": "Must be \"assistant\" to identify this as the model's response" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the model's response" + "eval_candidate": { + "$ref": "#/components/schemas/EvalCandidate" }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." + "scoring_params": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringFnParams" + } }, - "tool_calls": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolCall" - }, - "description": "List of tool calls. Each tool call is a ToolCall object." 
+ "num_examples": { + "type": "integer" } }, "additionalProperties": false, "required": [ - "role", - "content", - "stop_reason" + "eval_candidate", + "scoring_params" + ] + }, + "EvalCandidate": { + "oneOf": [ + { + "$ref": "#/components/schemas/ModelCandidate" + }, + { + "$ref": "#/components/schemas/AgentCandidate" + } ], - "description": "A message containing the model's (assistant) response in a chat conversation." + "discriminator": { + "propertyName": "type", + "mapping": { + "model": "#/components/schemas/ModelCandidate", + "agent": "#/components/schemas/AgentCandidate" + } + } }, "GrammarResponseFormat": { "type": "object", @@ -3022,38 +3017,97 @@ ], "description": "Configuration for JSON schema-guided response generation." }, - "Message": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" + "LLMAsJudgeScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm_as_judge", + "default": "llm_as_judge" }, - { - "$ref": "#/components/schemas/SystemMessage" + "judge_model": { + "type": "string" }, - { - "$ref": "#/components/schemas/ToolResponseMessage" + "prompt_template": { + "type": "string" }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ], - "discriminator": { - "propertyName": "role", - "mapping": { - "user": "#/components/schemas/UserMessage", - "system": "#/components/schemas/SystemMessage", - "tool": "#/components/schemas/ToolResponseMessage", - "assistant": "#/components/schemas/CompletionMessage" + "judge_score_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } } - } + }, + "additionalProperties": false, + "required": [ + "type", + "judge_model" + ] }, - "ResponseFormat": { - "oneOf": [ - { - "$ref": "#/components/schemas/JsonSchemaResponseFormat" + "ModelCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "model", + "default": "model" }, - { - "$ref": "#/components/schemas/GrammarResponseFormat" + "model": { + "type": "string" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "system_message": { + "$ref": "#/components/schemas/SystemMessage" + } + }, + "additionalProperties": false, + "required": [ + "type", + "model", + "sampling_params" + ] + }, + "RegexParserScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "regex_parser", + "default": "regex_parser" + }, + "parsing_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + "ResponseFormat": { + "oneOf": [ + { + "$ref": "#/components/schemas/JsonSchemaResponseFormat" + }, + { + "$ref": "#/components/schemas/GrammarResponseFormat" } ], "discriminator": { @@ -3105,6 +3159,27 @@ } } }, + "ScoringFnParams": { + "oneOf": [ + { + "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" + }, + { + "$ref": "#/components/schemas/RegexParserScoringFnParams" + }, + { + "$ref": "#/components/schemas/BasicScoringFnParams" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", + "regex_parser": "#/components/schemas/RegexParserScoringFnParams", + "basic": "#/components/schemas/BasicScoringFnParams" + } + 
} + }, "SystemMessage": { "type": "object", "properties": { @@ -3147,90 +3222,79 @@ ], "description": "A text content item" }, - "ToolCall": { + "ToolConfig": { "type": "object", "properties": { - "call_id": { + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.", + "default": "auto" + }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." + }, + "system_message_behavior": { + "type": "string", + "enum": [ + "append", + "replace" + ], + "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", + "default": "append" + } + }, + "additionalProperties": false, + "required": [ + "system_message_behavior" + ], + "description": "Configuration for tool use." + }, + "ToolDef": { + "type": "object", + "properties": { + "name": { "type": "string" }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] + "description": { + "type": "string" }, - "arguments": { + "parameters": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolParameter" + } + }, + "metadata": { "type": "object", "additionalProperties": { "oneOf": [ { - "type": "string" + "type": "null" }, { - "type": "integer" + "type": "boolean" }, { "type": "number" }, { - "type": "boolean" - }, - { - "type": "null" + "type": "string" }, { - "type": "array", - "items": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } + "type": "array" }, { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } + "type": "object" } ] } @@ -3238,49 +3302,16 @@ }, "additionalProperties": false, "required": [ - "call_id", - "tool_name", - "arguments" + "name" ] }, - "ToolDefinition": { + "ToolParameter": { "type": "object", "properties": { - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] - }, - "description": { + "name": { "type": "string" }, - "parameters": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ToolParamDefinition" - } - } - }, - "additionalProperties": false, - "required": [ - "tool_name" - ] - }, - "ToolParamDefinition": { - "type": "object", - "properties": { - "param_type": { + "parameter_type": { "type": "string" }, "description": { @@ -3315,60 
+3346,19 @@ }, "additionalProperties": false, "required": [ - "param_type" + "name", + "parameter_type", + "description", + "required" ] }, - "ToolResponseMessage": { + "TopKSamplingStrategy": { "type": "object", "properties": { - "role": { - "type": "string", - "const": "tool", - "default": "tool", - "description": "Must be \"tool\" to identify this as a tool response" - }, - "call_id": { + "type": { "type": "string", - "description": "Unique identifier for the tool call this response is for" - }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ], - "description": "Name of the tool that was called" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The response content from the tool" - } - }, - "additionalProperties": false, - "required": [ - "role", - "call_id", - "tool_name", - "content" - ], - "description": "A message representing the result of a tool invocation." - }, - "TopKSamplingStrategy": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "top_k", - "default": "top_k" + "const": "top_k", + "default": "top_k" }, "top_k": { "type": "integer" @@ -3413,597 +3403,692 @@ "uri" ] }, - "UserMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "user", - "default": "user", - "description": "Must be \"user\" to identify this as a user message" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the message, which can include text and other media" - }, - "context": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future." - } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "description": "A message from the user in a chat conversation." - }, - "BatchChatCompletionRequest": { + "DeprecatedEvaluateRowsRequest": { "type": "object", "properties": { - "model": { - "type": "string" - }, - "messages_batch": { + "input_rows": { "type": "array", "items": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Message" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] } } }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "tools": { + "scoring_functions": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDefinition" + "type": "string" } }, - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "Prompt format for calling custom / zero shot tools." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat" - }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." 
- } - }, - "additionalProperties": false + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, "required": [ - "model", - "messages_batch" + "input_rows", + "scoring_functions", + "task_config" ] }, - "BatchChatCompletionResponse": { + "EvaluateResponse": { "type": "object", "properties": { - "batch": { + "generations": { "type": "array", "items": { - "$ref": "#/components/schemas/ChatCompletionResponse" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "scores": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" } } }, "additionalProperties": false, "required": [ - "batch" + "generations", + "scores" ] }, - "ChatCompletionResponse": { + "ScoringResult": { "type": "object", "properties": { - "completion_message": { - "$ref": "#/components/schemas/CompletionMessage", - "description": "The complete response message" - }, - "logprobs": { + "score_rows": { "type": "array", "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" - } - }, - "additionalProperties": false, - "required": [ - "completion_message" - ], - "description": "Response from a chat completion request." - }, - "TokenLogProbs": { - "type": "object", - "properties": { - "logprobs_by_token": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "aggregated_results": { "type": "object", "additionalProperties": { - "type": "number" - }, - "description": "Dictionary mapping tokens to their log probabilities" + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, "required": [ - "logprobs_by_token" - ], - "description": "Log probabilities for generated tokens." + "score_rows", + "aggregated_results" + ] }, - "BatchCompletionRequest": { + "Benchmark": { "type": "object", "properties": { - "model": { + "identifier": { "type": "string" }, - "content_batch": { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContent" - } - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "provider_resource_id": { + "type": "string" }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "provider_id": { + "type": "string" }, - "logprobs": { + "type": { + "type": "string", + "const": "benchmark", + "default": "benchmark" + }, + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "metadata": { "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." 
- } - }, - "additionalProperties": false + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, "required": [ - "model", - "content_batch" + "identifier", + "provider_resource_id", + "provider_id", + "type", + "dataset_id", + "scoring_functions", + "metadata" ] }, - "BatchCompletionResponse": { + "JobStatus": { + "type": "string", + "enum": [ + "completed", + "in_progress", + "failed", + "scheduled" + ] + }, + "ListBenchmarksResponse": { "type": "object", "properties": { - "batch": { + "data": { "type": "array", "items": { - "$ref": "#/components/schemas/CompletionResponse" + "$ref": "#/components/schemas/Benchmark" } } }, "additionalProperties": false, "required": [ - "batch" + "data" ] }, - "CompletionResponse": { + "DeprecatedRegisterEvalTaskRequest": { "type": "object", "properties": { - "content": { - "type": "string", - "description": "The generated completion text" + "task_id": { + "type": "string" }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Reason why generation stopped" + "dataset_id": { + "type": "string" }, - "logprobs": { + "scoring_functions": { "type": "array", "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" + "type": "string" + } + }, + "provider_benchmark_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, "required": [ - "content", - "stop_reason" - ], - "description": "Response from a completion request." + "task_id", + "dataset_id", + "scoring_functions" + ] }, - "CancelTrainingJobRequest": { + "DeprecatedRunEvalRequest": { "type": "object", "properties": { - "job_uuid": { + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "task_config" + ] + }, + "Job": { + "type": "object", + "properties": { + "job_id": { "type": "string" } }, "additionalProperties": false, "required": [ - "job_uuid" + "job_id" ] }, - "ToolConfig": { + "AppendRowsRequest": { "type": "object", "properties": { - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.", - "default": "auto" - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." + "dataset_id": { + "type": "string" }, - "system_message_behavior": { - "type": "string", - "enum": [ - "append", - "replace" - ], - "description": "(Optional) Config for how to override the default system prompt. 
- `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", - "default": "append" + "rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } } }, "additionalProperties": false, "required": [ - "system_message_behavior" - ], - "description": "Configuration for tool use." + "dataset_id", + "rows" + ] }, - "ChatCompletionRequest": { + "CompletionMessage": { "type": "object", "properties": { - "model_id": { + "role": { "type": "string", - "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." + "const": "assistant", + "default": "assistant", + "description": "Must be \"assistant\" to identify this as the model's response" }, - "messages": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Message" - }, - "description": "List of messages in the conversation" + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the model's response" }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "Parameters to control the sampling strategy" + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." }, - "tools": { + "tool_calls": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDefinition" + "$ref": "#/components/schemas/ToolCall" }, - "description": "(Optional) List of tool definitions available to the model" - }, - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead." - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. 
This format is more flexible, but not all providers support it." - }, - "stream": { - "type": "boolean", - "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." - }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." - } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." - }, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig", - "description": "(Optional) Configuration for tool use." - } - }, - "additionalProperties": false, - "required": [ - "model_id", - "messages" - ] - }, - "ChatCompletionResponseEvent": { - "type": "object", - "properties": { - "event_type": { - "type": "string", - "enum": [ - "start", - "complete", - "progress" - ], - "description": "Type of the event" - }, - "delta": { - "$ref": "#/components/schemas/ContentDelta", - "description": "Content generated since last event. This can be one or more tokens, or a tool call." - }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" - }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Optional reason why generation stopped, if complete" - } - }, - "additionalProperties": false, - "required": [ - "event_type", - "delta" - ], - "description": "An event during chat completion generation." - }, - "ChatCompletionResponseStreamChunk": { - "type": "object", - "properties": { - "event": { - "$ref": "#/components/schemas/ChatCompletionResponseEvent", - "description": "The event containing the new content" + "description": "List of tool calls. Each tool call is a ToolCall object." } }, "additionalProperties": false, "required": [ - "event" + "role", + "content", + "stop_reason" ], - "description": "A chunk of a streamed chat completion response." + "description": "A message containing the model's (assistant) response in a chat conversation." 
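
For orientation, a minimal sketch of registering an eval task against the deprecated /v1/eval-tasks route using the DeprecatedRegisterEvalTaskRequest schema above. The base URL, the use of the requests library, and the concrete identifiers are illustrative assumptions, not part of the spec.

import requests

# Illustrative payload following DeprecatedRegisterEvalTaskRequest:
# task_id, dataset_id and scoring_functions are required; the rest optional.
payload = {
    "task_id": "meta-reference-mmlu",           # assumed example identifier
    "dataset_id": "mmlu",                       # assumed example dataset
    "scoring_functions": ["basic::subset_of"],  # assumed example scorer
}
resp = requests.post("http://localhost:8321/v1/eval-tasks", json=payload)  # assumed base URL
resp.raise_for_status()  # the spec only documents a 200 OK response
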
}, - "ContentDelta": { + "Message": { "oneOf": [ { - "$ref": "#/components/schemas/TextDelta" + "$ref": "#/components/schemas/UserMessage" }, { - "$ref": "#/components/schemas/ImageDelta" + "$ref": "#/components/schemas/SystemMessage" }, { - "$ref": "#/components/schemas/ToolCallDelta" + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" } ], "discriminator": { - "propertyName": "type", + "propertyName": "role", "mapping": { - "text": "#/components/schemas/TextDelta", - "image": "#/components/schemas/ImageDelta", - "tool_call": "#/components/schemas/ToolCallDelta" + "user": "#/components/schemas/UserMessage", + "system": "#/components/schemas/SystemMessage", + "tool": "#/components/schemas/ToolResponseMessage", + "assistant": "#/components/schemas/CompletionMessage" } } }, - "ImageDelta": { + "ToolCall": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "image", - "default": "image" + "call_id": { + "type": "string" }, - "image": { - "type": "string", - "contentEncoding": "base64" + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" + } + ] + }, + "arguments": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + }, + { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + } + ] + } } }, "additionalProperties": false, "required": [ - "type", - "image" + "call_id", + "tool_name", + "arguments" ] }, - "TextDelta": { + "ToolDefinition": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "text", - "default": "text" + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" + } + ] }, - "text": { + "description": { "type": "string" + }, + "parameters": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ToolParamDefinition" + } } }, "additionalProperties": false, "required": [ - "type", - "text" + "tool_name" ] }, - "ToolCallDelta": { + "ToolParamDefinition": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "tool_call", - "default": "tool_call" + "param_type": { + "type": "string" }, - "tool_call": { + "description": { + "type": "string" + }, + "required": { + "type": "boolean", + "default": true + }, + "default": { "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, { "type": "string" }, { - "$ref": "#/components/schemas/ToolCall" + "type": "array" + }, + { + "type": "object" } ] - }, - "parse_status": { - "type": "string", - "enum": [ - "started", - "in_progress", - "failed", - "succeeded" - ] } }, "additionalProperties": false, "required": [ - "type", - "tool_call", - "parse_status" + "param_type" ] }, - "CompletionRequest": { + "ToolResponseMessage": { "type": "object", "properties": { - "model_id": { + "role": { "type": "string", - "description": "The 
identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content to generate a completion for" - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "(Optional) Parameters to control the sampling strategy" - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding" + "const": "tool", + "default": "tool", + "description": "Must be \"tool\" to identify this as a tool response" }, - "stream": { - "type": "boolean", - "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." + "call_id": { + "type": "string", + "description": "Unique identifier for the tool call this response is for" }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." + ], + "description": "Name of the tool that was called" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The response content from the tool" } }, "additionalProperties": false, "required": [ - "model_id", + "role", + "call_id", + "tool_name", "content" - ] + ], + "description": "A message representing the result of a tool invocation." }, - "CompletionResponseStreamChunk": { + "UserMessage": { "type": "object", "properties": { - "delta": { + "role": { "type": "string", - "description": "New content generated since last chunk. This can be one or more tokens." + "const": "user", + "default": "user", + "description": "Must be \"user\" to identify this as a user message" }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Optional reason why generation stopped, if complete" + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the message, which can include text and other media" }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" + "context": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future." } }, "additionalProperties": false, "required": [ - "delta" + "role", + "content" ], - "description": "A chunk of a streamed completion response." + "description": "A message from the user in a chat conversation." 
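
The Message union above is discriminated on role; a rough sketch of a conversation history conforming to it (contents invented for illustration):

# Each entry selects one variant of the Message union via its "role" field.
messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "What is the capital of France?"},
    {
        "role": "assistant",
        "content": "Paris.",
        "stop_reason": "end_of_turn",  # required on CompletionMessage
        "tool_calls": [],
    },
]
print([m["role"] for m in messages])  # ['system', 'user', 'assistant']
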
}, - "AgentConfig": { + "BatchChatCompletionRequest": { "type": "object", "properties": { - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "input_shields": { - "type": "array", - "items": { - "type": "string" - } + "model": { + "type": "string" }, - "output_shields": { + "messages_batch": { "type": "array", "items": { - "type": "string" + "type": "array", + "items": { + "$ref": "#/components/schemas/Message" + } } }, - "toolgroups": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AgentTool" - } + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" }, - "client_tools": { + "tools": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDef" + "$ref": "#/components/schemas/ToolDefinition" } }, "tool_choice": { @@ -4012,8 +4097,7 @@ "auto", "required" ], - "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.", - "default": "auto" + "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." }, "tool_prompt_format": { "type": "string", @@ -4024,521 +4108,518 @@ ], "description": "Prompt format for calling custom / zero shot tools." }, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig" - }, - "max_infer_iters": { - "type": "integer", - "default": 10 - }, - "model": { - "type": "string" - }, - "instructions": { - "type": "string" - }, - "enable_session_persistence": { - "type": "boolean" - }, "response_format": { "$ref": "#/components/schemas/ResponseFormat" + }, + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false } }, "additionalProperties": false, "required": [ "model", - "instructions", - "enable_session_persistence" + "messages_batch" ] }, - "AgentTool": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "args": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "name", - "args" - ] + "BatchChatCompletionResponse": { + "type": "object", + "properties": { + "batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ChatCompletionResponse" + } } + }, + "additionalProperties": false, + "required": [ + "batch" ] }, - "ToolDef": { + "ChatCompletionResponse": { "type": "object", "properties": { - "name": { - "type": "string" - }, - "description": { - "type": "string" + "completion_message": { + "$ref": "#/components/schemas/CompletionMessage", + "description": "The complete response message" }, - "parameters": { + "logprobs": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolParameter" - } - }, - "metadata": { + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" + } + }, + "additionalProperties": false, + "required": [ + "completion_message" + ], + "description": "Response from a chat completion request." 
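
To make the ChatCompletionResponse shape concrete, a sketch of unpacking a fabricated response body; only completion_message is required, logprobs is optional:

# Fabricated body conforming to the ChatCompletionResponse schema above.
body = {
    "completion_message": {
        "role": "assistant",
        "content": "Hello!",
        "stop_reason": "end_of_turn",
    }
}
msg = body["completion_message"]
print(msg["content"])        # the generated text
print(msg["stop_reason"])    # why generation stopped
print(body.get("logprobs"))  # None unless logprobs were requested
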
+ }, + "TokenLogProbs": { + "type": "object", + "properties": { + "logprobs_by_token": { "type": "object", "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "type": "number" + }, + "description": "Dictionary mapping tokens to their log probabilities" } }, "additionalProperties": false, "required": [ - "name" - ] + "logprobs_by_token" + ], + "description": "Log probabilities for generated tokens." }, - "ToolParameter": { + "BatchCompletionRequest": { "type": "object", "properties": { - "name": { + "model": { "type": "string" }, - "parameter_type": { - "type": "string" + "content_batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContent" + } }, - "description": { - "type": "string" + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" }, - "required": { - "type": "boolean", - "default": true + "response_format": { + "$ref": "#/components/schemas/ResponseFormat" }, - "default": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." } - ] + }, + "additionalProperties": false } }, "additionalProperties": false, "required": [ - "name", - "parameter_type", - "description", - "required" + "model", + "content_batch" ] }, - "CreateAgentRequest": { + "BatchCompletionResponse": { "type": "object", "properties": { - "agent_config": { - "$ref": "#/components/schemas/AgentConfig" + "batch": { + "type": "array", + "items": { + "$ref": "#/components/schemas/CompletionResponse" + } } }, "additionalProperties": false, "required": [ - "agent_config" + "batch" ] }, - "AgentCreateResponse": { + "CompletionResponse": { "type": "object", "properties": { - "agent_id": { - "type": "string" + "content": { + "type": "string", + "description": "The generated completion text" + }, + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Reason why generation stopped" + }, + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" } }, "additionalProperties": false, "required": [ - "agent_id" - ] + "content", + "stop_reason" + ], + "description": "Response from a completion request." }, - "CreateAgentSessionRequest": { + "CancelTrainingJobRequest": { "type": "object", "properties": { - "session_name": { + "job_uuid": { "type": "string" } }, "additionalProperties": false, "required": [ - "session_name" + "job_uuid" ] }, - "AgentSessionCreateResponse": { + "ChatCompletionRequest": { "type": "object", "properties": { - "session_id": { - "type": "string" + "model_id": { + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." 
+ }, + "messages": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Message" + }, + "description": "List of messages in the conversation" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "Parameters to control the sampling strategy" + }, + "tools": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolDefinition" + }, + "description": "(Optional) List of tool definitions available to the model" + }, + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead." + }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead." + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it." + }, + "stream": { + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." + }, + "logprobs": { + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." + }, + "tool_config": { + "$ref": "#/components/schemas/ToolConfig", + "description": "(Optional) Configuration for tool use." } }, "additionalProperties": false, "required": [ - "session_id" + "model_id", + "messages" ] }, - "CreateAgentTurnRequest": { + "ChatCompletionResponseEvent": { "type": "object", "properties": { - "messages": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - } - ] - } - }, - "stream": { - "type": "boolean" - }, - "documents": { - "type": "array", - "items": { - "type": "object", - "properties": { - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/InterleavedContentItem" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContentItem" - } - }, - { - "$ref": "#/components/schemas/URL" - } - ] - }, - "mime_type": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "content", - "mime_type" - ] - } + "event_type": { + "type": "string", + "enum": [ + "start", + "complete", + "progress" + ], + "description": "Type of the event" }, - "toolgroups": { + "delta": { + "$ref": "#/components/schemas/ContentDelta", + "description": "Content generated since last event. This can be one or more tokens, or a tool call." 
+ }, + "logprobs": { "type": "array", "items": { - "$ref": "#/components/schemas/AgentTool" - } + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" }, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig" + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Optional reason why generation stopped, if complete" } }, "additionalProperties": false, "required": [ - "messages" - ] + "event_type", + "delta" + ], + "description": "An event during chat completion generation." }, - "InferenceStep": { + "ChatCompletionResponseStreamChunk": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, - "step_id": { - "type": "string" + "event": { + "$ref": "#/components/schemas/ChatCompletionResponseEvent", + "description": "The event containing the new content" + } + }, + "additionalProperties": false, + "required": [ + "event" + ], + "description": "A chunk of a streamed chat completion response." + }, + "ContentDelta": { + "oneOf": [ + { + "$ref": "#/components/schemas/TextDelta" }, - "started_at": { - "type": "string", - "format": "date-time" + { + "$ref": "#/components/schemas/ImageDelta" }, - "completed_at": { + { + "$ref": "#/components/schemas/ToolCallDelta" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "text": "#/components/schemas/TextDelta", + "image": "#/components/schemas/ImageDelta", + "tool_call": "#/components/schemas/ToolCallDelta" + } + } + }, + "ImageDelta": { + "type": "object", + "properties": { + "type": { "type": "string", - "format": "date-time" + "const": "image", + "default": "image" }, - "step_type": { + "image": { "type": "string", - "const": "inference", - "default": "inference" - }, - "model_response": { - "$ref": "#/components/schemas/CompletionMessage" + "contentEncoding": "base64" } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "model_response" + "type", + "image" ] }, - "MemoryRetrievalStep": { + "TextDelta": { "type": "object", "properties": { - "turn_id": { - "type": "string" + "type": { + "type": "string", + "const": "text", + "default": "text" }, - "step_id": { + "text": { "type": "string" - }, - "started_at": { + } + }, + "additionalProperties": false, + "required": [ + "type", + "text" + ] + }, + "ToolCallDelta": { + "type": "object", + "properties": { + "type": { "type": "string", - "format": "date-time" + "const": "tool_call", + "default": "tool_call" }, - "completed_at": { - "type": "string", - "format": "date-time" + "tool_call": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/ToolCall" + } + ] }, - "step_type": { + "parse_status": { "type": "string", - "const": "memory_retrieval", - "default": "memory_retrieval" - }, - "vector_db_ids": { - "type": "string" - }, - "inserted_context": { - "$ref": "#/components/schemas/InterleavedContent" + "enum": [ + "started", + "in_progress", + "failed", + "succeeded" + ] } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "vector_db_ids", - "inserted_context" + "type", + "tool_call", + "parse_status" ] }, - "SafetyViolation": { + "CompletionRequest": { "type": "object", "properties": { - "violation_level": { - "$ref": "#/components/schemas/ViolationLevel" + "model_id": { + "type": "string", + "description": "The identifier of the model to use. 
The model must be registered with Llama Stack and available via the /models endpoint." }, - "user_message": { - "type": "string" + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content to generate a completion for" }, - "metadata": { + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "(Optional) Parameters to control the sampling strategy" + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding" + }, + "stream": { + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." + }, + "logprobs": { "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "properties": { + "top_k": { + "type": "integer", + "default": 0, + "description": "How many tokens (for each position) to return log probabilities for." + } + }, + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." } }, "additionalProperties": false, "required": [ - "violation_level", - "metadata" + "model_id", + "content" ] }, - "ShieldCallStep": { + "CompletionResponseStreamChunk": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, - "step_id": { - "type": "string" - }, - "started_at": { - "type": "string", - "format": "date-time" - }, - "completed_at": { + "delta": { "type": "string", - "format": "date-time" + "description": "New content generated since last chunk. This can be one or more tokens." }, - "step_type": { + "stop_reason": { "type": "string", - "const": "shield_call", - "default": "shield_call" + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Optional reason why generation stopped, if complete" }, - "violation": { - "$ref": "#/components/schemas/SafetyViolation" + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + }, + "description": "Optional log probabilities for generated tokens" + } + }, + "additionalProperties": false, + "required": [ + "delta" + ], + "description": "A chunk of a streamed completion response." 
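
A sketch of accumulating streamed chunks that conform to the CompletionResponseStreamChunk schema above; the chunk list is fabricated stand-in data for what a real client would read off the SSE stream:

# Each chunk carries a required "delta"; stop_reason appears once complete.
chunks = [
    {"delta": "The sky "},
    {"delta": "is blue."},
    {"delta": "", "stop_reason": "end_of_turn"},
]
text = "".join(c["delta"] for c in chunks)
stop_reason = next((c["stop_reason"] for c in chunks if "stop_reason" in c), None)
print(text)         # The sky is blue.
print(stop_reason)  # end_of_turn
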
+ }, + "CreateAgentRequest": { + "type": "object", + "properties": { + "agent_config": { + "$ref": "#/components/schemas/AgentConfig" } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type" + "agent_config" ] }, - "ToolExecutionStep": { + "AgentCreateResponse": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, - "step_id": { + "agent_id": { "type": "string" - }, - "started_at": { - "type": "string", - "format": "date-time" - }, - "completed_at": { - "type": "string", - "format": "date-time" - }, - "step_type": { - "type": "string", - "const": "tool_execution", - "default": "tool_execution" - }, - "tool_calls": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolCall" - } - }, - "tool_responses": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolResponse" - } } }, "additionalProperties": false, "required": [ - "turn_id", - "step_id", - "step_type", - "tool_calls", - "tool_responses" + "agent_id" ] }, - "ToolResponse": { + "CreateAgentSessionRequest": { "type": "object", "properties": { - "call_id": { + "session_name": { "type": "string" - }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - { - "type": "string" - } - ] - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "call_id", - "tool_name", - "content" + "session_name" ] }, - "Turn": { + "AgentSessionCreateResponse": { "type": "object", "properties": { - "turn_id": { - "type": "string" - }, "session_id": { "type": "string" - }, - "input_messages": { + } + }, + "additionalProperties": false, + "required": [ + "session_id" + ] + }, + "CreateAgentTurnRequest": { + "type": "object", + "properties": { + "messages": { "type": "array", "items": { "oneOf": [ @@ -4551,38 +4632,10 @@ ] } }, - "steps": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/InferenceStep" - }, - { - "$ref": "#/components/schemas/ToolExecutionStep" - }, - { - "$ref": "#/components/schemas/ShieldCallStep" - }, - { - "$ref": "#/components/schemas/MemoryRetrievalStep" - } - ], - "discriminator": { - "propertyName": "step_type", - "mapping": { - "inference": "#/components/schemas/InferenceStep", - "tool_execution": "#/components/schemas/ToolExecutionStep", - "shield_call": "#/components/schemas/ShieldCallStep", - "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" - } - } - } - }, - "output_message": { - "$ref": "#/components/schemas/CompletionMessage" + "stream": { + "type": "boolean" }, - "output_attachments": { + "documents": { "type": "array", "items": { "type": "object", @@ -4617,179 +4670,100 @@ ] } }, - "started_at": { - "type": "string", - "format": "date-time" + "toolgroups": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AgentTool" + } }, - "completed_at": { - "type": "string", - "format": "date-time" + "tool_config": { + "$ref": "#/components/schemas/ToolConfig" } }, "additionalProperties": false, "required": [ - "turn_id", - "session_id", - "input_messages", - "steps", - "output_message", - "started_at" - ], - "description": "A single turn in an interaction with an Agentic System." 
- }, - "ViolationLevel": { - "type": "string", - "enum": [ - "info", - "warn", - "error" + "messages" ] }, - "AgentTurnResponseEvent": { + "InferenceStep": { "type": "object", "properties": { - "payload": { - "$ref": "#/components/schemas/AgentTurnResponseEventPayload" - } - }, - "additionalProperties": false, - "required": [ - "payload" - ] - }, - "AgentTurnResponseEventPayload": { - "oneOf": [ - { - "$ref": "#/components/schemas/AgentTurnResponseStepStartPayload" - }, - { - "$ref": "#/components/schemas/AgentTurnResponseStepProgressPayload" + "turn_id": { + "type": "string" }, - { - "$ref": "#/components/schemas/AgentTurnResponseStepCompletePayload" + "step_id": { + "type": "string" }, - { - "$ref": "#/components/schemas/AgentTurnResponseTurnStartPayload" + "started_at": { + "type": "string", + "format": "date-time" }, - { - "$ref": "#/components/schemas/AgentTurnResponseTurnCompletePayload" - } - ], - "discriminator": { - "propertyName": "event_type", - "mapping": { - "step_start": "#/components/schemas/AgentTurnResponseStepStartPayload", - "step_progress": "#/components/schemas/AgentTurnResponseStepProgressPayload", - "step_complete": "#/components/schemas/AgentTurnResponseStepCompletePayload", - "turn_start": "#/components/schemas/AgentTurnResponseTurnStartPayload", - "turn_complete": "#/components/schemas/AgentTurnResponseTurnCompletePayload" - } - } - }, - "AgentTurnResponseStepCompletePayload": { - "type": "object", - "properties": { - "event_type": { + "completed_at": { "type": "string", - "const": "step_complete", - "default": "step_complete" + "format": "date-time" }, "step_type": { "type": "string", - "enum": [ - "inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] - }, - "step_id": { - "type": "string" + "const": "inference", + "default": "inference" }, - "step_details": { - "oneOf": [ - { - "$ref": "#/components/schemas/InferenceStep" - }, - { - "$ref": "#/components/schemas/ToolExecutionStep" - }, - { - "$ref": "#/components/schemas/ShieldCallStep" - }, - { - "$ref": "#/components/schemas/MemoryRetrievalStep" - } - ], - "discriminator": { - "propertyName": "step_type", - "mapping": { - "inference": "#/components/schemas/InferenceStep", - "tool_execution": "#/components/schemas/ToolExecutionStep", - "shield_call": "#/components/schemas/ShieldCallStep", - "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" - } - } + "model_response": { + "$ref": "#/components/schemas/CompletionMessage" } }, "additionalProperties": false, "required": [ - "event_type", - "step_type", + "turn_id", "step_id", - "step_details" + "step_type", + "model_response" ] }, - "AgentTurnResponseStepProgressPayload": { + "MemoryRetrievalStep": { "type": "object", "properties": { - "event_type": { + "turn_id": { + "type": "string" + }, + "step_id": { + "type": "string" + }, + "started_at": { "type": "string", - "const": "step_progress", - "default": "step_progress" + "format": "date-time" + }, + "completed_at": { + "type": "string", + "format": "date-time" }, "step_type": { "type": "string", - "enum": [ - "inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] + "const": "memory_retrieval", + "default": "memory_retrieval" }, - "step_id": { + "vector_db_ids": { "type": "string" }, - "delta": { - "$ref": "#/components/schemas/ContentDelta" + "inserted_context": { + "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "event_type", - "step_type", + "turn_id", "step_id", - "delta" + "step_type", + "vector_db_ids", 
+ "inserted_context" ] }, - "AgentTurnResponseStepStartPayload": { + "SafetyViolation": { "type": "object", "properties": { - "event_type": { - "type": "string", - "const": "step_start", - "default": "step_start" - }, - "step_type": { - "type": "string", - "enum": [ - "inference", - "tool_execution", - "shield_call", - "memory_retrieval" - ] + "violation_level": { + "$ref": "#/components/schemas/ViolationLevel" }, - "step_id": { + "user_message": { "type": "string" }, "metadata": { @@ -4820,416 +4794,384 @@ }, "additionalProperties": false, "required": [ - "event_type", - "step_type", - "step_id" - ] - }, - "AgentTurnResponseStreamChunk": { - "type": "object", - "properties": { - "event": { - "$ref": "#/components/schemas/AgentTurnResponseEvent" - } - }, - "additionalProperties": false, - "required": [ - "event" - ], - "description": "streamed agent turn completion response." - }, - "AgentTurnResponseTurnCompletePayload": { - "type": "object", - "properties": { - "event_type": { - "type": "string", - "const": "turn_complete", - "default": "turn_complete" - }, - "turn": { - "$ref": "#/components/schemas/Turn" - } - }, - "additionalProperties": false, - "required": [ - "event_type", - "turn" + "violation_level", + "metadata" ] }, - "AgentTurnResponseTurnStartPayload": { + "ShieldCallStep": { "type": "object", "properties": { - "event_type": { - "type": "string", - "const": "turn_start", - "default": "turn_start" - }, "turn_id": { "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "event_type", - "turn_id" - ] - }, - "EmbeddingsRequest": { - "type": "object", - "properties": { - "model_id": { + }, + "step_id": { + "type": "string" + }, + "started_at": { "type": "string", - "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." + "format": "date-time" }, - "contents": { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContent" - }, - "description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text." - } - }, - "additionalProperties": false, - "required": [ - "model_id", - "contents" - ] - }, - "EmbeddingsResponse": { - "type": "object", - "properties": { - "embeddings": { - "type": "array", - "items": { - "type": "array", - "items": { - "type": "number" - } - }, - "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}" - } - }, - "additionalProperties": false, - "required": [ - "embeddings" - ], - "description": "Response containing generated embeddings." 
- }, - "AgentCandidate": { - "type": "object", - "properties": { - "type": { + "completed_at": { "type": "string", - "const": "agent", - "default": "agent" + "format": "date-time" }, - "config": { - "$ref": "#/components/schemas/AgentConfig" - } - }, - "additionalProperties": false, - "required": [ - "type", - "config" - ] - }, - "AggregationFunctionType": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ] - }, - "BasicScoringFnParams": { - "type": "object", - "properties": { - "type": { + "step_type": { "type": "string", - "const": "basic", - "default": "basic" + "const": "shield_call", + "default": "shield_call" }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } + "violation": { + "$ref": "#/components/schemas/SafetyViolation" } }, "additionalProperties": false, "required": [ - "type" + "turn_id", + "step_id", + "step_type" ] }, - "BenchmarkConfig": { + "ToolExecutionStep": { "type": "object", "properties": { - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate" - }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" - } + "turn_id": { + "type": "string" }, - "num_examples": { - "type": "integer" - } - }, - "additionalProperties": false, - "required": [ - "eval_candidate", - "scoring_params" - ] - }, - "EvalCandidate": { - "oneOf": [ - { - "$ref": "#/components/schemas/ModelCandidate" + "step_id": { + "type": "string" }, - { - "$ref": "#/components/schemas/AgentCandidate" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "model": "#/components/schemas/ModelCandidate", - "agent": "#/components/schemas/AgentCandidate" - } - } - }, - "LLMAsJudgeScoringFnParams": { - "type": "object", - "properties": { - "type": { + "started_at": { "type": "string", - "const": "llm_as_judge", - "default": "llm_as_judge" + "format": "date-time" }, - "judge_model": { - "type": "string" + "completed_at": { + "type": "string", + "format": "date-time" }, - "prompt_template": { - "type": "string" + "step_type": { + "type": "string", + "const": "tool_execution", + "default": "tool_execution" }, - "judge_score_regexes": { + "tool_calls": { "type": "array", "items": { - "type": "string" + "$ref": "#/components/schemas/ToolCall" } }, - "aggregation_functions": { + "tool_responses": { "type": "array", "items": { - "$ref": "#/components/schemas/AggregationFunctionType" + "$ref": "#/components/schemas/ToolResponse" } } }, "additionalProperties": false, "required": [ - "type", - "judge_model" + "turn_id", + "step_id", + "step_type", + "tool_calls", + "tool_responses" ] }, - "ModelCandidate": { + "ToolResponse": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "model", - "default": "model" - }, - "model": { + "call_id": { "type": "string" }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + { + "type": "string" + } + ] }, - "system_message": { - "$ref": "#/components/schemas/SystemMessage" + "content": { + "$ref": "#/components/schemas/InterleavedContent" } }, "additionalProperties": false, "required": [ - "type", - "model", - "sampling_params" + "call_id", + "tool_name", + "content" ] }, - "RegexParserScoringFnParams": { + "Turn": { "type": "object", "properties": { - "type": { - "type": "string", - 
"const": "regex_parser", - "default": "regex_parser" + "turn_id": { + "type": "string" }, - "parsing_regexes": { + "session_id": { + "type": "string" + }, + "input_messages": { "type": "array", "items": { - "type": "string" + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + } + ] } }, - "aggregation_functions": { + "steps": { "type": "array", "items": { - "$ref": "#/components/schemas/AggregationFunctionType" + "oneOf": [ + { + "$ref": "#/components/schemas/InferenceStep" + }, + { + "$ref": "#/components/schemas/ToolExecutionStep" + }, + { + "$ref": "#/components/schemas/ShieldCallStep" + }, + { + "$ref": "#/components/schemas/MemoryRetrievalStep" + } + ], + "discriminator": { + "propertyName": "step_type", + "mapping": { + "inference": "#/components/schemas/InferenceStep", + "tool_execution": "#/components/schemas/ToolExecutionStep", + "shield_call": "#/components/schemas/ShieldCallStep", + "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" + } + } + } + }, + "output_message": { + "$ref": "#/components/schemas/CompletionMessage" + }, + "output_attachments": { + "type": "array", + "items": { + "type": "object", + "properties": { + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/InterleavedContentItem" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContentItem" + } + }, + { + "$ref": "#/components/schemas/URL" + } + ] + }, + "mime_type": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "content", + "mime_type" + ] } + }, + "started_at": { + "type": "string", + "format": "date-time" + }, + "completed_at": { + "type": "string", + "format": "date-time" } }, "additionalProperties": false, "required": [ - "type" + "turn_id", + "session_id", + "input_messages", + "steps", + "output_message", + "started_at" + ], + "description": "A single turn in an interaction with an Agentic System." 
+ }, + "ViolationLevel": { + "type": "string", + "enum": [ + "info", + "warn", + "error" ] }, - "ScoringFnParams": { + "AgentTurnResponseEvent": { + "type": "object", + "properties": { + "payload": { + "$ref": "#/components/schemas/AgentTurnResponseEventPayload" + } + }, + "additionalProperties": false, + "required": [ + "payload" + ] + }, + "AgentTurnResponseEventPayload": { "oneOf": [ { - "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" + "$ref": "#/components/schemas/AgentTurnResponseStepStartPayload" }, { - "$ref": "#/components/schemas/RegexParserScoringFnParams" + "$ref": "#/components/schemas/AgentTurnResponseStepProgressPayload" }, { - "$ref": "#/components/schemas/BasicScoringFnParams" + "$ref": "#/components/schemas/AgentTurnResponseStepCompletePayload" + }, + { + "$ref": "#/components/schemas/AgentTurnResponseTurnStartPayload" + }, + { + "$ref": "#/components/schemas/AgentTurnResponseTurnCompletePayload" } ], "discriminator": { - "propertyName": "type", + "propertyName": "event_type", "mapping": { - "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", - "regex_parser": "#/components/schemas/RegexParserScoringFnParams", - "basic": "#/components/schemas/BasicScoringFnParams" + "step_start": "#/components/schemas/AgentTurnResponseStepStartPayload", + "step_progress": "#/components/schemas/AgentTurnResponseStepProgressPayload", + "step_complete": "#/components/schemas/AgentTurnResponseStepCompletePayload", + "turn_start": "#/components/schemas/AgentTurnResponseTurnStartPayload", + "turn_complete": "#/components/schemas/AgentTurnResponseTurnCompletePayload" } } }, - "EvaluateRowsRequest": { + "AgentTurnResponseStepCompletePayload": { "type": "object", "properties": { - "input_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] + "event_type": { + "type": "string", + "const": "step_complete", + "default": "step_complete" + }, + "step_type": { + "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ] + }, + "step_id": { + "type": "string" + }, + "step_details": { + "oneOf": [ + { + "$ref": "#/components/schemas/InferenceStep" + }, + { + "$ref": "#/components/schemas/ToolExecutionStep" + }, + { + "$ref": "#/components/schemas/ShieldCallStep" + }, + { + "$ref": "#/components/schemas/MemoryRetrievalStep" + } + ], + "discriminator": { + "propertyName": "step_type", + "mapping": { + "inference": "#/components/schemas/InferenceStep", + "tool_execution": "#/components/schemas/ToolExecutionStep", + "shield_call": "#/components/schemas/ShieldCallStep", + "memory_retrieval": "#/components/schemas/MemoryRetrievalStep" } } - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } - }, - "task_config": { - "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, "required": [ - "input_rows", - "scoring_functions", - "task_config" + "event_type", + "step_type", + "step_id", + "step_details" ] }, - "EvaluateResponse": { + "AgentTurnResponseStepProgressPayload": { "type": "object", "properties": { - "generations": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": 
"object" - } - ] - } - } + "event_type": { + "type": "string", + "const": "step_progress", + "default": "step_progress" }, - "scores": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - } + "step_type": { + "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ] + }, + "step_id": { + "type": "string" + }, + "delta": { + "$ref": "#/components/schemas/ContentDelta" } }, "additionalProperties": false, "required": [ - "generations", - "scores" + "event_type", + "step_type", + "step_id", + "delta" ] }, - "ScoringResult": { + "AgentTurnResponseStepStartPayload": { "type": "object", "properties": { - "score_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } + "event_type": { + "type": "string", + "const": "step_start", + "default": "step_start" }, - "aggregated_results": { + "step_type": { + "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ] + }, + "step_id": { + "type": "string" + }, + "metadata": { "type": "object", "additionalProperties": { "oneOf": [ @@ -5257,11 +5199,102 @@ }, "additionalProperties": false, "required": [ - "score_rows", - "aggregated_results" + "event_type", + "step_type", + "step_id" + ] + }, + "AgentTurnResponseStreamChunk": { + "type": "object", + "properties": { + "event": { + "$ref": "#/components/schemas/AgentTurnResponseEvent" + } + }, + "additionalProperties": false, + "required": [ + "event" + ], + "description": "streamed agent turn completion response." + }, + "AgentTurnResponseTurnCompletePayload": { + "type": "object", + "properties": { + "event_type": { + "type": "string", + "const": "turn_complete", + "default": "turn_complete" + }, + "turn": { + "$ref": "#/components/schemas/Turn" + } + }, + "additionalProperties": false, + "required": [ + "event_type", + "turn" + ] + }, + "AgentTurnResponseTurnStartPayload": { + "type": "object", + "properties": { + "event_type": { + "type": "string", + "const": "turn_start", + "default": "turn_start" + }, + "turn_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "event_type", + "turn_id" + ] + }, + "EmbeddingsRequest": { + "type": "object", + "properties": { + "model_id": { + "type": "string", + "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." + }, + "contents": { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContent" + }, + "description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text." + } + }, + "additionalProperties": false, + "required": [ + "model_id", + "contents" ] }, - "EvaluateRowsDeprecatedRequest": { + "EmbeddingsResponse": { + "type": "object", + "properties": { + "embeddings": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "number" + } + }, + "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. 
The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}" + } + }, + "additionalProperties": false, + "required": [ + "embeddings" + ], + "description": "Response containing generated embeddings." + }, + "EvaluateRowsRequest": { "type": "object", "properties": { "input_rows": { @@ -6165,15 +6198,6 @@ ], "description": "Artifacts of a finetuning job." }, - "JobStatus": { - "type": "string", - "enum": [ - "completed", - "in_progress", - "failed", - "scheduled" - ] - }, "PostTrainingJobStatusResponse": { "type": "object", "properties": { @@ -7812,30 +7836,6 @@ "task_config" ] }, - "Job": { - "type": "object", - "properties": { - "job_id": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_id" - ] - }, - "RunEvalDeprecatedRequest": { - "type": "object", - "properties": { - "task_config": { - "$ref": "#/components/schemas/BenchmarkConfig" - } - }, - "additionalProperties": false, - "required": [ - "task_config" - ] - }, "RunShieldRequest": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 89e0669177..83bc5483c8 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,6 +10,31 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: + /v1/eval/tasks/{task_id}/evaluations: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedEvaluateRowsRequest' + required: true + deprecated: true /v1/eval-tasks/{benchmark_id}: get: responses: @@ -31,6 +56,75 @@ paths: schema: type: string deprecated: true + /v1/eval/tasks/{task_id}/jobs/{job_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/JobStatus' + - type: 'null' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + delete: + responses: + '200': + description: OK + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true /v1/eval-tasks: get: responses: @@ -60,6 +154,31 @@ paths: $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' required: true deprecated: true + /v1/eval/tasks/{task_id}/jobs: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/Job' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedRunEvalRequest' + required: true + deprecated: 
true /v1/datasetio/rows: get: responses: @@ -396,31 +515,6 @@ paths: schema: $ref: '#/components/schemas/EvaluateRowsRequest' required: true - /v1/eval/tasks/{task_id}/evaluations: - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - tags: - - Eval - description: '' - parameters: - - name: task_id - in: path - required: true - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateRowsDeprecatedRequest' - required: true - deprecated: true /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}: get: responses: @@ -920,51 +1014,6 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}/jobs/{job_id}: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/JobStatus' - - type: 'null' - tags: - - Eval - description: '' - parameters: - - name: task_id - in: path - required: true - schema: - type: string - - name: job_id - in: path - required: true - schema: - type: string - deprecated: true - delete: - responses: - '200': - description: OK - tags: - - Eval - description: '' - parameters: - - name: task_id - in: path - required: true - schema: - type: string - - name: job_id - in: path - required: true - schema: - type: string - deprecated: true /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: get: responses: @@ -988,30 +1037,6 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}/jobs/{job_id}/result: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - tags: - - Eval - description: '' - parameters: - - name: task_id - in: path - required: true - schema: - type: string - - name: job_id - in: path - required: true - schema: - type: string - deprecated: true /v1/eval/benchmarks: get: responses: @@ -1446,31 +1471,6 @@ paths: schema: $ref: '#/components/schemas/RunEvalRequest' required: true - /v1/eval/tasks/{task_id}/jobs: - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/Job' - tags: - - Eval - description: '' - parameters: - - name: task_id - in: path - required: true - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RunEvalDeprecatedRequest' - required: true - deprecated: true /v1/safety/run-shield: post: responses: @@ -1598,143 +1598,142 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: - Benchmark: + AgentCandidate: type: object properties: - identifier: - type: string - provider_resource_id: - type: string - provider_id: - type: string type: type: string - const: benchmark - default: benchmark - dataset_id: - type: string - scoring_functions: - type: array - items: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + const: agent + default: agent + config: + $ref: '#/components/schemas/AgentConfig' additionalProperties: false required: - - identifier - - provider_resource_id - - provider_id - type - - dataset_id - - scoring_functions - - metadata - ListBenchmarksResponse: + - config + AgentConfig: type: object properties: - data: + sampling_params: + $ref: '#/components/schemas/SamplingParams' + input_shields: type: array items: - $ref: 
'#/components/schemas/Benchmark' - additionalProperties: false - required: - - data - DeprecatedRegisterEvalTaskRequest: - type: object - properties: - task_id: - type: string - dataset_id: - type: string - scoring_functions: + type: string + output_shields: type: array items: type: string - provider_benchmark_id: + toolgroups: + type: array + items: + $ref: '#/components/schemas/AgentTool' + client_tools: + type: array + items: + $ref: '#/components/schemas/ToolDef' + tool_choice: type: string - provider_id: + enum: + - auto + - required + description: >- + Whether tool use is required or automatic. This is a hint to the model + which may not be followed. It depends on the Instruction Following capabilities + of the model. + default: auto + tool_prompt_format: type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + enum: + - json + - function_tag + - python_list + description: >- + Prompt format for calling custom / zero shot tools. + tool_config: + $ref: '#/components/schemas/ToolConfig' + max_infer_iters: + type: integer + default: 10 + model: + type: string + instructions: + type: string + enable_session_persistence: + type: boolean + response_format: + $ref: '#/components/schemas/ResponseFormat' additionalProperties: false required: - - task_id - - dataset_id - - scoring_functions - AppendRowsRequest: + - model + - instructions + - enable_session_persistence + AgentTool: + oneOf: + - type: string + - type: object + properties: + name: + type: string + args: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + - args + AggregationFunctionType: + type: string + enum: + - average + - median + - categorical_count + - accuracy + BasicScoringFnParams: type: object properties: - dataset_id: + type: type: string - rows: + const: basic + default: basic + aggregation_functions: type: array items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + $ref: '#/components/schemas/AggregationFunctionType' additionalProperties: false required: - - dataset_id - - rows - CompletionMessage: + - type + BenchmarkConfig: type: object properties: - role: - type: string - const: assistant - default: assistant - description: >- - Must be "assistant" to identify this as the model's response - content: - $ref: '#/components/schemas/InterleavedContent' - description: The content of the model's response - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: - The model finished generating the entire response. - `StopReason.end_of_message`: - The model finished generating but generated a partial response -- usually, - a tool call. The user may call the tool and continue the conversation - with the tool's response. - `StopReason.out_of_tokens`: The model ran - out of token budget. - tool_calls: - type: array - items: - $ref: '#/components/schemas/ToolCall' - description: >- - List of tool calls. Each tool call is a ToolCall object. 
+ eval_candidate: + $ref: '#/components/schemas/EvalCandidate' + scoring_params: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringFnParams' + num_examples: + type: integer additionalProperties: false required: - - role - - content - - stop_reason - description: >- - A message containing the model's (assistant) response in a chat conversation. + - eval_candidate + - scoring_params + EvalCandidate: + oneOf: + - $ref: '#/components/schemas/ModelCandidate' + - $ref: '#/components/schemas/AgentCandidate' + discriminator: + propertyName: type + mapping: + model: '#/components/schemas/ModelCandidate' + agent: '#/components/schemas/AgentCandidate' GrammarResponseFormat: type: object properties: @@ -1845,56 +1844,113 @@ components: - json_schema description: >- Configuration for JSON schema-guided response generation. - Message: - oneOf: - - $ref: '#/components/schemas/UserMessage' - - $ref: '#/components/schemas/SystemMessage' - - $ref: '#/components/schemas/ToolResponseMessage' - - $ref: '#/components/schemas/CompletionMessage' - discriminator: - propertyName: role - mapping: - user: '#/components/schemas/UserMessage' - system: '#/components/schemas/SystemMessage' - tool: '#/components/schemas/ToolResponseMessage' - assistant: '#/components/schemas/CompletionMessage' - ResponseFormat: - oneOf: - - $ref: '#/components/schemas/JsonSchemaResponseFormat' - - $ref: '#/components/schemas/GrammarResponseFormat' - discriminator: - propertyName: type - mapping: - json_schema: '#/components/schemas/JsonSchemaResponseFormat' - grammar: '#/components/schemas/GrammarResponseFormat' - SamplingParams: + LLMAsJudgeScoringFnParams: type: object properties: - strategy: - $ref: '#/components/schemas/SamplingStrategy' - max_tokens: - type: integer - default: 0 - repetition_penalty: - type: number - default: 1.0 + type: + type: string + const: llm_as_judge + default: llm_as_judge + judge_model: + type: string + prompt_template: + type: string + judge_score_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' additionalProperties: false required: - - strategy - SamplingStrategy: - oneOf: - - $ref: '#/components/schemas/GreedySamplingStrategy' - - $ref: '#/components/schemas/TopPSamplingStrategy' - - $ref: '#/components/schemas/TopKSamplingStrategy' - discriminator: - propertyName: type - mapping: - greedy: '#/components/schemas/GreedySamplingStrategy' - top_p: '#/components/schemas/TopPSamplingStrategy' - top_k: '#/components/schemas/TopKSamplingStrategy' - SystemMessage: - type: object - properties: + - type + - judge_model + ModelCandidate: + type: object + properties: + type: + type: string + const: model + default: model + model: + type: string + sampling_params: + $ref: '#/components/schemas/SamplingParams' + system_message: + $ref: '#/components/schemas/SystemMessage' + additionalProperties: false + required: + - type + - model + - sampling_params + RegexParserScoringFnParams: + type: object + properties: + type: + type: string + const: regex_parser + default: regex_parser + parsing_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type + ResponseFormat: + oneOf: + - $ref: '#/components/schemas/JsonSchemaResponseFormat' + - $ref: '#/components/schemas/GrammarResponseFormat' + discriminator: + propertyName: type + mapping: + json_schema: 
'#/components/schemas/JsonSchemaResponseFormat' + grammar: '#/components/schemas/GrammarResponseFormat' + SamplingParams: + type: object + properties: + strategy: + $ref: '#/components/schemas/SamplingStrategy' + max_tokens: + type: integer + default: 0 + repetition_penalty: + type: number + default: 1.0 + additionalProperties: false + required: + - strategy + SamplingStrategy: + oneOf: + - $ref: '#/components/schemas/GreedySamplingStrategy' + - $ref: '#/components/schemas/TopPSamplingStrategy' + - $ref: '#/components/schemas/TopKSamplingStrategy' + discriminator: + propertyName: type + mapping: + greedy: '#/components/schemas/GreedySamplingStrategy' + top_p: '#/components/schemas/TopPSamplingStrategy' + top_k: '#/components/schemas/TopKSamplingStrategy' + ScoringFnParams: + oneOf: + - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' + - $ref: '#/components/schemas/RegexParserScoringFnParams' + - $ref: '#/components/schemas/BasicScoringFnParams' + discriminator: + propertyName: type + mapping: + llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' + regex_parser: '#/components/schemas/RegexParserScoringFnParams' + basic: '#/components/schemas/BasicScoringFnParams' + SystemMessage: + type: object + properties: role: type: string const: system @@ -1921,15 +1977,392 @@ components: const: text default: text description: >- - Discriminator type of the content item. Always "text" - text: - type: string - description: Text content + Discriminator type of the content item. Always "text" + text: + type: string + description: Text content + additionalProperties: false + required: + - type + - text + description: A text content item + ToolConfig: + type: object + properties: + tool_choice: + type: string + enum: + - auto + - required + description: >- + (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. + default: auto + tool_prompt_format: + type: string + enum: + - json + - function_tag + - python_list + description: >- + (Optional) Instructs the model how to format tool calls. By default, Llama + Stack will attempt to use a format that is best adapted to the model. + - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. + - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a + tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python + syntax -- a list of function calls. + system_message_behavior: + type: string + enum: + - append + - replace + description: >- + (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: + Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: + Replaces the default system prompt with the provided system message. The + system message can include the string '{{function_definitions}}' to indicate + where the function definitions should be inserted. + default: append + additionalProperties: false + required: + - system_message_behavior + description: Configuration for tool use. 
+ ToolDef: + type: object + properties: + name: + type: string + description: + type: string + parameters: + type: array + items: + $ref: '#/components/schemas/ToolParameter' + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + ToolParameter: + type: object + properties: + name: + type: string + parameter_type: + type: string + description: + type: string + required: + type: boolean + default: true + default: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + - parameter_type + - description + - required + TopKSamplingStrategy: + type: object + properties: + type: + type: string + const: top_k + default: top_k + top_k: + type: integer + additionalProperties: false + required: + - type + - top_k + TopPSamplingStrategy: + type: object + properties: + type: + type: string + const: top_p + default: top_p + temperature: + type: number + top_p: + type: number + default: 0.95 + additionalProperties: false + required: + - type + URL: + type: object + properties: + uri: + type: string + additionalProperties: false + required: + - uri + DeprecatedEvaluateRowsRequest: + type: object + properties: + input_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scoring_functions: + type: array + items: + type: string + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - input_rows + - scoring_functions + - task_config + EvaluateResponse: + type: object + properties: + generations: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scores: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + additionalProperties: false + required: + - generations + - scores + ScoringResult: + type: object + properties: + score_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + aggregated_results: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - score_rows + - aggregated_results + Benchmark: + type: object + properties: + identifier: + type: string + provider_resource_id: + type: string + provider_id: + type: string + type: + type: string + const: benchmark + default: benchmark + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - identifier + - provider_resource_id + - provider_id + - type + - dataset_id + - scoring_functions + - metadata + JobStatus: + type: string + enum: + - completed + - in_progress + - failed + - scheduled + ListBenchmarksResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/Benchmark' + additionalProperties: false + required: + - data + 
DeprecatedRegisterEvalTaskRequest: + type: object + properties: + task_id: + type: string + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + provider_benchmark_id: + type: string + provider_id: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - task_id + - dataset_id + - scoring_functions + DeprecatedRunEvalRequest: + type: object + properties: + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - task_config + Job: + type: object + properties: + job_id: + type: string + additionalProperties: false + required: + - job_id + AppendRowsRequest: + type: object + properties: + dataset_id: + type: string + rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - dataset_id + - rows + CompletionMessage: + type: object + properties: + role: + type: string + const: assistant + default: assistant + description: >- + Must be "assistant" to identify this as the model's response + content: + $ref: '#/components/schemas/InterleavedContent' + description: The content of the model's response + stop_reason: + type: string + enum: + - end_of_turn + - end_of_message + - out_of_tokens + description: >- + Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: + The model finished generating the entire response. - `StopReason.end_of_message`: + The model finished generating but generated a partial response -- usually, + a tool call. The user may call the tool and continue the conversation + with the tool's response. - `StopReason.out_of_tokens`: The model ran + out of token budget. + tool_calls: + type: array + items: + $ref: '#/components/schemas/ToolCall' + description: >- + List of tool calls. Each tool call is a ToolCall object. additionalProperties: false required: - - type - - text - description: A text content item + - role + - content + - stop_reason + description: >- + A message containing the model's (assistant) response in a chat conversation. + Message: + oneOf: + - $ref: '#/components/schemas/UserMessage' + - $ref: '#/components/schemas/SystemMessage' + - $ref: '#/components/schemas/ToolResponseMessage' + - $ref: '#/components/schemas/CompletionMessage' + discriminator: + propertyName: role + mapping: + user: '#/components/schemas/UserMessage' + system: '#/components/schemas/SystemMessage' + tool: '#/components/schemas/ToolResponseMessage' + assistant: '#/components/schemas/CompletionMessage' ToolCall: type: object properties: @@ -2050,42 +2483,6 @@ components: - content description: >- A message representing the result of a tool invocation. 
- TopKSamplingStrategy: - type: object - properties: - type: - type: string - const: top_k - default: top_k - top_k: - type: integer - additionalProperties: false - required: - - type - - top_k - TopPSamplingStrategy: - type: object - properties: - type: - type: string - const: top_p - default: top_p - temperature: - type: number - top_p: - type: number - default: 0.95 - additionalProperties: false - required: - - type - URL: - type: object - properties: - uri: - type: string - additionalProperties: false - required: - - uri UserMessage: type: object properties: @@ -2266,46 +2663,6 @@ components: additionalProperties: false required: - job_uuid - ToolConfig: - type: object - properties: - tool_choice: - type: string - enum: - - auto - - required - description: >- - (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. - default: auto - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - (Optional) Instructs the model how to format tool calls. By default, Llama - Stack will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python - syntax -- a list of function calls. - system_message_behavior: - type: string - enum: - - append - - replace - description: >- - (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: - Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: - Replaces the default system prompt with the provided system message. The - system message can include the string '{{function_definitions}}' to indicate - where the function definitions should be inserted. - default: append - additionalProperties: false - required: - - system_message_behavior - description: Configuration for tool use. ChatCompletionRequest: type: object properties: @@ -2528,161 +2885,33 @@ components: additionalProperties: false required: - model_id - - content - CompletionResponseStreamChunk: - type: object - properties: - delta: - type: string - description: >- - New content generated since last chunk. This can be one or more tokens. - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Optional reason why generation stopped, if complete - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - description: >- - Optional log probabilities for generated tokens - additionalProperties: false - required: - - delta - description: >- - A chunk of a streamed completion response. - AgentConfig: - type: object - properties: - sampling_params: - $ref: '#/components/schemas/SamplingParams' - input_shields: - type: array - items: - type: string - output_shields: - type: array - items: - type: string - toolgroups: - type: array - items: - $ref: '#/components/schemas/AgentTool' - client_tools: - type: array - items: - $ref: '#/components/schemas/ToolDef' - tool_choice: - type: string - enum: - - auto - - required - description: >- - Whether tool use is required or automatic. This is a hint to the model - which may not be followed. It depends on the Instruction Following capabilities - of the model. 
- default: auto - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - Prompt format for calling custom / zero shot tools. - tool_config: - $ref: '#/components/schemas/ToolConfig' - max_infer_iters: - type: integer - default: 10 - model: - type: string - instructions: - type: string - enable_session_persistence: - type: boolean - response_format: - $ref: '#/components/schemas/ResponseFormat' - additionalProperties: false - required: - - model - - instructions - - enable_session_persistence - AgentTool: - oneOf: - - type: string - - type: object - properties: - name: - type: string - args: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - name - - args - ToolDef: - type: object - properties: - name: - type: string - description: - type: string - parameters: - type: array - items: - $ref: '#/components/schemas/ToolParameter' - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - name - ToolParameter: - type: object - properties: - name: - type: string - parameter_type: - type: string - description: - type: string - required: - type: boolean - default: true - default: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + - content + CompletionResponseStreamChunk: + type: object + properties: + delta: + type: string + description: >- + New content generated since last chunk. This can be one or more tokens. + stop_reason: + type: string + enum: + - end_of_turn + - end_of_message + - out_of_tokens + description: >- + Optional reason why generation stopped, if complete + logprobs: + type: array + items: + $ref: '#/components/schemas/TokenLogProbs' + description: >- + Optional log probabilities for generated tokens additionalProperties: false required: - - name - - parameter_type - - description - - required + - delta + description: >- + A chunk of a streamed completion response. CreateAgentRequest: type: object properties: @@ -3162,134 +3391,6 @@ components: - embeddings description: >- Response containing generated embeddings. 
- AgentCandidate: - type: object - properties: - type: - type: string - const: agent - default: agent - config: - $ref: '#/components/schemas/AgentConfig' - additionalProperties: false - required: - - type - - config - AggregationFunctionType: - type: string - enum: - - average - - median - - categorical_count - - accuracy - BasicScoringFnParams: - type: object - properties: - type: - type: string - const: basic - default: basic - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - BenchmarkConfig: - type: object - properties: - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - num_examples: - type: integer - additionalProperties: false - required: - - eval_candidate - - scoring_params - EvalCandidate: - oneOf: - - $ref: '#/components/schemas/ModelCandidate' - - $ref: '#/components/schemas/AgentCandidate' - discriminator: - propertyName: type - mapping: - model: '#/components/schemas/ModelCandidate' - agent: '#/components/schemas/AgentCandidate' - LLMAsJudgeScoringFnParams: - type: object - properties: - type: - type: string - const: llm_as_judge - default: llm_as_judge - judge_model: - type: string - prompt_template: - type: string - judge_score_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - - judge_model - ModelCandidate: - type: object - properties: - type: - type: string - const: model - default: model - model: - type: string - sampling_params: - $ref: '#/components/schemas/SamplingParams' - system_message: - $ref: '#/components/schemas/SystemMessage' - additionalProperties: false - required: - - type - - model - - sampling_params - RegexParserScoringFnParams: - type: object - properties: - type: - type: string - const: regex_parser - default: regex_parser - parsing_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - ScoringFnParams: - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - - $ref: '#/components/schemas/RegexParserScoringFnParams' - - $ref: '#/components/schemas/BasicScoringFnParams' - discriminator: - propertyName: type - mapping: - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - basic: '#/components/schemas/BasicScoringFnParams' EvaluateRowsRequest: type: object properties: @@ -3316,84 +3417,6 @@ components: - input_rows - scoring_functions - task_config - EvaluateResponse: - type: object - properties: - generations: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - scores: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - additionalProperties: false - required: - - generations - - scores - ScoringResult: - type: object - properties: - score_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - aggregated_results: - type: object - additionalProperties: - oneOf: - - type: 
'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - score_rows - - aggregated_results - EvaluateRowsDeprecatedRequest: - type: object - properties: - input_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - scoring_functions: - type: array - items: - type: string - task_config: - $ref: '#/components/schemas/BenchmarkConfig' - additionalProperties: false - required: - - input_rows - - scoring_functions - - task_config Session: type: object properties: @@ -3925,13 +3948,6 @@ components: - job_uuid - checkpoints description: Artifacts of a finetuning job. - JobStatus: - type: string - enum: - - completed - - in_progress - - failed - - scheduled PostTrainingJobStatusResponse: type: object properties: @@ -4935,22 +4951,6 @@ components: additionalProperties: false required: - task_config - Job: - type: object - properties: - job_id: - type: string - additionalProperties: false - required: - - job_id - RunEvalDeprecatedRequest: - type: object - properties: - task_config: - $ref: '#/components/schemas/BenchmarkConfig' - additionalProperties: false - required: - - task_config RunShieldRequest: type: object properties: diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 599df201a0..857b7f1337 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1203,7 +1203,7 @@ ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " benchmark_id=\"meta-reference::simpleqa\",\n", + " task_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", @@ -1214,7 +1214,7 @@ " \"sampling_params\": {\n", " \"strategy\": {\n", " \"type\": \"greedy\",\n", - " },\n", + " },b\n", " \"max_tokens\": 4096,\n", " \"repeat_penalty\": 1.0,\n", " },\n", diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index b805e49762..010189cc7c 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -85,14 +85,14 @@ async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") - async def run_eval_DEPRECATED( + async def DEPRECATED_run_eval( self, task_id: str, task_config: BenchmarkConfig, ) -> Job: ... @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") - async def evaluate_rows_DEPRECATED( + async def DEPRECATED_evaluate_rows( self, task_id: str, input_rows: List[Dict[str, Any]], @@ -101,10 +101,10 @@ async def evaluate_rows_DEPRECATED( ) -> EvaluateResponse: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") - async def job_status_DEPRECATED(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") - async def job_cancel_DEPRECATED(self, task_id: str, job_id: str) -> None: ... + async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ... 
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") - async def job_result_DEPRECATED(self, task_id: str, job_id: str) -> EvaluateResponse: ... + async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ... diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 9945ad367b..845010f543 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -395,6 +395,48 @@ async def job_result( job_id, ) + async def DEPRECATED_run_eval( + self, + task_id: str, + task_config: BenchmarkConfig, + ) -> Job: + return await self.run_eval(benchmark_id=task_id, task_config=task_config) + + async def DEPRECATED_evaluate_rows( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: + return await self.evaluate_rows( + benchmark_id=task_id, + input_rows=input_rows, + scoring_functions=scoring_functions, + task_config=task_config, + ) + + async def DEPRECATED_job_status( + self, + task_id: str, + job_id: str, + ) -> Optional[JobStatus]: + return await self.job_status(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_cancel( + self, + task_id: str, + job_id: str, + ) -> None: + return await self.job_cancel(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_result( + self, + task_id: str, + job_id: str, + ) -> EvaluateResponse: + return await self.job_result(benchmark_id=task_id, job_id=job_id) + class ToolRuntimeRouter(ToolRuntime): class RagToolImpl(RAGToolRuntime): diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 5d2da73372..b16becf1a9 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -471,18 +471,18 @@ async def register_benchmark( async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: logger.warning("DEPRECATED: Use /eval/benchmarks instead") - raise DeprecationWarning("Use /eval/tasks instead") + return await self.list_benchmarks() async def DEPRECATED_get_eval_task( self, - benchmark_id: str, + task_id: str, ) -> Optional[Benchmark]: logger.warning("DEPRECATED: Use /eval/benchmarks instead") - raise DeprecationWarning("Use /eval/tasks instead") + return await self.get_benchmark(task_id) async def DEPRECATED_register_eval_task( self, - benchmark_id: str, + task_id: str, dataset_id: str, scoring_functions: List[str], provider_benchmark_id: Optional[str] = None, @@ -490,8 +490,8 @@ async def DEPRECATED_register_eval_task( metadata: Optional[Dict[str, Any]] = None, ) -> None: logger.warning("DEPRECATED: Use /eval/benchmarks instead") - self.register_benchmark( - benchmark_id=benchmark_id, + return await self.register_benchmark( + benchmark_id=task_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index a02418e741..3ae530a47a 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -234,3 +234,45 @@ async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: raise ValueError(f"Job is not completed, Status: {status.value}") return self.jobs[job_id] + + async def DEPRECATED_run_eval( + self, + task_id: str, + task_config: 
BenchmarkConfig, + ) -> Job: + return await self.run_eval(benchmark_id=task_id, task_config=task_config) + + async def DEPRECATED_evaluate_rows( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: + return await self.evaluate_rows( + benchmark_id=task_id, + input_rows=input_rows, + scoring_functions=scoring_functions, + task_config=task_config, + ) + + async def DEPRECATED_job_status( + self, + task_id: str, + job_id: str, + ) -> Optional[JobStatus]: + return await self.job_status(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_cancel( + self, + task_id: str, + job_id: str, + ) -> None: + return await self.job_cancel(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_result( + self, + task_id: str, + job_id: str, + ) -> EvaluateResponse: + return await self.job_result(benchmark_id=task_id, job_id=job_id) From 1395de57a6de1499b4dda289535ea39b32b5b418 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:16:47 -0800 Subject: [PATCH 14/31] fix --- docs/_static/llama-stack-spec.html | 4 ++-- docs/_static/llama-stack-spec.yaml | 4 ++-- docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 381f37f1f0..cba7829a18 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -81,7 +81,7 @@ "deprecated": true } }, - "/v1/eval-tasks/{benchmark_id}": { + "/v1/eval-tasks/{task_id}": { "get": { "responses": { "200": { @@ -109,7 +109,7 @@ "parameters": [ { "name": "task_id", - "in": "query", + "in": "path", "required": true, "schema": { "type": "string" diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 83bc5483c8..0bc4987764 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -35,7 +35,7 @@ paths: $ref: '#/components/schemas/DeprecatedEvaluateRowsRequest' required: true deprecated: true - /v1/eval-tasks/{benchmark_id}: + /v1/eval-tasks/{task_id}: get: responses: '200': @@ -51,7 +51,7 @@ paths: description: '' parameters: - name: task_id - in: query + in: path required: true schema: type: string diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 857b7f1337..8eecf84abb 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1017,14 +1017,14 @@ " \"content\": SYSTEM_PROMPT_TEMPLATE.format(subject=subset),\n", "}\n", "\n", - "client.benchmarks.register(\n", - " benchmark_id=\"meta-reference::mmmu\",\n", + "client.eval_tasks.register(\n", + " eval_task_id=\"meta-reference::mmmu\",\n", " dataset_id=f\"mmmu-{subset}-{split}\",\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " benchmark_id=\"meta-reference::mmmu\",\n", + " task_id=\"meta-reference::mmmu\",\n", " input_rows=eval_rows,\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", " task_config={\n", @@ -1196,8 +1196,8 @@ " provider_id=\"together\",\n", ")\n", "\n", - "client.benchmarks.register(\n", - " benchmark_id=\"meta-reference::simpleqa\",\n", + "client.eval_tasks.register(\n", + " eval_task_id=\"meta-reference::simpleqa\",\n", " dataset_id=simpleqa_dataset_id,\n", " 
scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", ")\n", @@ -1214,7 +1214,7 @@ " \"sampling_params\": {\n", " \"strategy\": {\n", " \"type\": \"greedy\",\n", - " },b\n", + " },\n", " \"max_tokens\": 4096,\n", " \"repeat_penalty\": 1.0,\n", " },\n", @@ -1352,7 +1352,7 @@ "}\n", "\n", "response = client.eval.evaluate_rows(\n", - " benchmark_id=\"meta-reference::simpleqa\",\n", + " task_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", From 10e8c964539e332ba45583acbdb41561376258ce Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:18:48 -0800 Subject: [PATCH 15/31] add benchmarks --- llama_stack/apis/benchmarks/__init__.py | 7 ++ llama_stack/apis/benchmarks/benchmarks.py | 86 +++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 llama_stack/apis/benchmarks/__init__.py create mode 100644 llama_stack/apis/benchmarks/benchmarks.py diff --git a/llama_stack/apis/benchmarks/__init__.py b/llama_stack/apis/benchmarks/__init__.py new file mode 100644 index 0000000000..f8f5649570 --- /dev/null +++ b/llama_stack/apis/benchmarks/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .benchmarks import * # noqa: F401 F403 diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py new file mode 100644 index 0000000000..75f2b3d053 --- /dev/null +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -0,0 +1,86 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable + +from llama_models.schema_utils import json_schema_type, webmethod +from pydantic import BaseModel, Field + +from llama_stack.apis.resource import Resource, ResourceType + + +class CommonBenchmarkFields(BaseModel): + dataset_id: str + scoring_functions: List[str] + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="Metadata for this evaluation task", + ) + + +@json_schema_type +class Benchmark(CommonBenchmarkFields, Resource): + type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value + + @property + def benchmark_id(self) -> str: + return self.identifier + + @property + def provider_benchmark_id(self) -> str: + return self.provider_resource_id + + +class BenchmarkInput(CommonBenchmarkFields, BaseModel): + benchmark_id: str + provider_id: Optional[str] = None + provider_benchmark_id: Optional[str] = None + + +class ListBenchmarksResponse(BaseModel): + data: List[Benchmark] + + +@runtime_checkable +class Benchmarks(Protocol): + @webmethod(route="/eval/benchmarks", method="GET") + async def list_benchmarks(self) -> ListBenchmarksResponse: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET") + async def get_benchmark( + self, + benchmark_id: str, + ) -> Optional[Benchmark]: ... + + @webmethod(route="/eval/benchmarks", method="POST") + async def register_benchmark( + self, + benchmark_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_benchmark_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... 
+ + @webmethod(route="/eval-tasks", method="GET") + async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ... + + @webmethod(route="/eval-tasks/{task_id}", method="GET") + async def DEPRECATED_get_eval_task( + self, + task_id: str, + ) -> Optional[Benchmark]: ... + + @webmethod(route="/eval-tasks", method="POST") + async def DEPRECATED_register_eval_task( + self, + task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_benchmark_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... From 234fe36d62366569ed4913ba6405401f92976740 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:38:40 -0800 Subject: [PATCH 16/31] fix cli download --- docs/_static/llama-stack-spec.html | 8 ++++---- docs/_static/llama-stack-spec.yaml | 8 ++++---- llama_stack/apis/benchmarks/benchmarks.py | 4 ++-- llama_stack/apis/eval/eval.py | 1 + llama_stack/cli/download.py | 14 +++++++------- llama_stack/cli/verify_download.py | 4 ++-- .../distribution/routers/routing_tables.py | 8 ++++---- .../inline/eval/meta_reference/eval.py | 2 +- llama_stack/providers/tests/eval/test_eval.py | 18 +++++++++--------- 9 files changed, 34 insertions(+), 33 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index b4506f5d5d..ea7a8f2100 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -108,8 +108,8 @@ "description": "", "parameters": [ { - "name": "task_id", - "in": "path", + "name": "eval_task_id", + "in": "query", "required": true, "schema": { "type": "string" @@ -3726,7 +3726,7 @@ "DeprecatedRegisterEvalTaskRequest": { "type": "object", "properties": { - "task_id": { + "eval_task_id": { "type": "string" }, "dataset_id": { @@ -3772,7 +3772,7 @@ }, "additionalProperties": false, "required": [ - "task_id", + "eval_task_id", "dataset_id", "scoring_functions" ] diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 6f655939eb..19c646bf98 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -50,8 +50,8 @@ paths: - Benchmarks description: '' parameters: - - name: task_id - in: path + - name: eval_task_id + in: query required: true schema: type: string @@ -2248,7 +2248,7 @@ components: DeprecatedRegisterEvalTaskRequest: type: object properties: - task_id: + eval_task_id: type: string dataset_id: type: string @@ -2272,7 +2272,7 @@ components: - type: object additionalProperties: false required: - - task_id + - eval_task_id - dataset_id - scoring_functions DeprecatedRunEvalRequest: diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py index 75f2b3d053..50019b18c7 100644 --- a/llama_stack/apis/benchmarks/benchmarks.py +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -71,13 +71,13 @@ async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ... @webmethod(route="/eval-tasks/{task_id}", method="GET") async def DEPRECATED_get_eval_task( self, - task_id: str, + eval_task_id: str, ) -> Optional[Benchmark]: ... 
@webmethod(route="/eval-tasks", method="POST") async def DEPRECATED_register_eval_task( self, - task_id: str, + eval_task_id: str, dataset_id: str, scoring_functions: List[str], provider_benchmark_id: Optional[str] = None, diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 010189cc7c..e5c7821503 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -39,6 +39,7 @@ class AgentCandidate(BaseModel): @json_schema_type class BenchmarkConfig(BaseModel): + type: Literal["benchmark"] = "benchmark" eval_candidate: EvalCandidate scoring_params: Dict[str, ScoringFnParams] = Field( description="Map between scoring function id and parameters for each scoring function you want to run", diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index 7b9b303f48..379ac49caa 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -105,7 +105,7 @@ class DownloadTask: output_file: str total_size: int = 0 downloaded_size: int = 0 - benchmark_id: Optional[int] = None + task_id: Optional[int] = None retries: int = 0 max_retries: int = 3 @@ -183,8 +183,8 @@ async def _get_info(): ) # Update the progress bar's total size once we know it - if task.benchmark_id is not None: - self.progress.update(task.benchmark_id, total=task.total_size) + if task.task_id is not None: + self.progress.update(task.task_id, total=task.total_size) except httpx.HTTPError as e: self.console.print(f"[red]Error getting file info: {str(e)}[/red]") @@ -207,7 +207,7 @@ async def _download_chunk(): file.write(chunk) task.downloaded_size += len(chunk) self.progress.update( - task.benchmark_id, + task.task_id, completed=task.downloaded_size, ) @@ -234,7 +234,7 @@ async def download_file(self, task: DownloadTask) -> None: if os.path.exists(task.output_file): if self.verify_file_integrity(task): self.console.print(f"[green]Already downloaded {task.output_file}[/green]") - self.progress.update(task.benchmark_id, completed=task.total_size) + self.progress.update(task.task_id, completed=task.total_size) return await self.prepare_download(task) @@ -258,7 +258,7 @@ async def download_file(self, task: DownloadTask) -> None: raise DownloadError(f"Download failed: {str(e)}") from e except Exception as e: - self.progress.update(task.benchmark_id, description=f"[red]Failed: {task.output_file}[/red]") + self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]") raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e def has_disk_space(self, tasks: List[DownloadTask]) -> bool: @@ -293,7 +293,7 @@ async def download_all(self, tasks: List[DownloadTask]) -> None: with self.progress: for task in tasks: desc = f"Downloading {Path(task.output_file).name}" - task.benchmark_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) + task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) semaphore = asyncio.Semaphore(self.max_concurrent_downloads) diff --git a/llama_stack/cli/verify_download.py b/llama_stack/cli/verify_download.py index ca72ca5818..47993c3613 100644 --- a/llama_stack/cli/verify_download.py +++ b/llama_stack/cli/verify_download.py @@ -82,7 +82,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) as progress: for filepath, expected_hash in checksums.items(): full_path = model_dir / filepath - benchmark_id = progress.add_task(f"Verifying {filepath}...", total=None) + task_id = 
progress.add_task(f"Verifying {filepath}...", total=None) exists = full_path.exists() actual_hash = None @@ -102,7 +102,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) ) - progress.remove_task(benchmark_id) + progress.remove_task(task_id) return results diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index b16becf1a9..99c73986ce 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -475,14 +475,14 @@ async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: async def DEPRECATED_get_eval_task( self, - task_id: str, + eval_task_id: str, ) -> Optional[Benchmark]: logger.warning("DEPRECATED: Use /eval/benchmarks instead") - return await self.get_benchmark(task_id) + return await self.get_benchmark(eval_task_id) async def DEPRECATED_register_eval_task( self, - task_id: str, + eval_task_id: str, dataset_id: str, scoring_functions: List[str], provider_benchmark_id: Optional[str] = None, @@ -491,7 +491,7 @@ async def DEPRECATED_register_eval_task( ) -> None: logger.warning("DEPRECATED: Use /eval/benchmarks instead") return await self.register_benchmark( - benchmark_id=task_id, + benchmark_id=eval_task_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 3ae530a47a..ea2acd7bbc 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -205,7 +205,7 @@ async def evaluate_rows( # scoring with generated_answer score_input_rows = [input_r | generated_r for input_r, generated_r in zip(input_rows, generations)] - if task_config.type == "app" and task_config.scoring_params is not None: + if task_config.scoring_params is not None: scoring_functions_dict = { scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None) for scoring_fn_id in scoring_functions diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py index 78351a28ef..c2f351aa83 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -59,14 +59,14 @@ async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model scoring_functions = [ "basic::equality", ] - task_id = "meta-reference::app_eval" + benchmark_id = "meta-reference::app_eval" await benchmarks_impl.register_benchmark( - benchmark_id=task_id, + benchmark_id=benchmark_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=rows.rows, scoring_functions=scoring_functions, task_config=AppBenchmarkConfig( @@ -105,14 +105,14 @@ async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): "basic::subset_of", ] - task_id = "meta-reference::app_eval-2" + benchmark_id = "meta-reference::app_eval-2" await benchmarks_impl.register_benchmark( - benchmark_id=task_id, + benchmark_id=benchmark_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.run_eval( - task_id=task_id, + benchmark_id=benchmark_id, task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, @@ -121,9 +121,9 @@ async def test_eval_run_eval(self, eval_stack, 
inference_model, judge_model): ), ) assert response.job_id == "0" - job_status = await eval_impl.job_status(task_id, response.job_id) + job_status = await eval_impl.job_status(benchmark_id, response.job_id) assert job_status and job_status.value == "completed" - eval_response = await eval_impl.job_result(task_id, response.job_id) + eval_response = await eval_impl.job_result(benchmark_id, response.job_id) assert eval_response is not None assert len(eval_response.generations) == 5 @@ -171,7 +171,7 @@ async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): benchmark_id = "meta-reference-mmlu" response = await eval_impl.run_eval( - task_id=benchmark_id, + benchmark_id=benchmark_id, task_config=BenchmarkBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, From 5f5a7b628f5aee9ce3a1559fd68ed67802f733d5 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:39:52 -0800 Subject: [PATCH 17/31] openapi --- docs/_static/llama-stack-spec.html | 171 +++++++++++++------------- docs/_static/llama-stack-spec.yaml | 185 +---------------------------- 2 files changed, 94 insertions(+), 262 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index ea7a8f2100..b93f6a380a 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2699,8 +2699,7 @@ "auto", "required" ], - "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.", - "default": "auto" + "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." 
}, "tool_prompt_format": { "type": "string", @@ -2815,6 +2814,11 @@ "BenchmarkConfig": { "type": "object", "properties": { + "type": { + "type": "string", + "const": "benchmark", + "default": "benchmark" + }, "eval_candidate": { "$ref": "#/components/schemas/EvalCandidate" }, @@ -2830,6 +2834,7 @@ }, "additionalProperties": false, "required": [ + "type", "eval_candidate", "scoring_params" ] @@ -3498,17 +3503,7 @@ "ScoringResult": { "type": "object", "properties": { - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricEvent" - } - }, - "completion_message": { - "$ref": "#/components/schemas/CompletionMessage", - "description": "The complete response message" - }, - "logprobs": { + "score_rows": { "type": "array", "items": { "type": "object", @@ -3568,75 +3563,7 @@ "aggregated_results" ] }, - "MetricEvent": { - "type": "object", - "properties": { - "trace_id": { - "type": "string" - }, - "span_id": { - "type": "string" - }, - "timestamp": { - "type": "string", - "format": "date-time" - }, - "attributes": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } - }, - "type": { - "type": "string", - "const": "metric", - "default": "metric" - }, - "metric": { - "type": "string" - }, - "value": { - "oneOf": [ - { - "type": "integer" - }, - { - "type": "number" - } - ] - }, - "unit": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "trace_id", - "span_id", - "timestamp", - "type", - "metric", - "value", - "unit" - ] - }, - "TokenLogProbs": { + "Benchmark": { "type": "object", "properties": { "identifier": { @@ -4225,6 +4152,12 @@ "ChatCompletionResponse": { "type": "object", "properties": { + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } + }, "completion_message": { "$ref": "#/components/schemas/CompletionMessage", "description": "The complete response message" @@ -4243,6 +4176,74 @@ ], "description": "Response from a chat completion request." 
}, + "MetricEvent": { + "type": "object", + "properties": { + "trace_id": { + "type": "string" + }, + "span_id": { + "type": "string" + }, + "timestamp": { + "type": "string", + "format": "date-time" + }, + "attributes": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + }, + "type": { + "type": "string", + "const": "metric", + "default": "metric" + }, + "metric": { + "type": "string" + }, + "value": { + "oneOf": [ + { + "type": "integer" + }, + { + "type": "number" + } + ] + }, + "unit": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "trace_id", + "span_id", + "timestamp", + "type", + "metric", + "value", + "unit" + ] + }, "TokenLogProbs": { "type": "object", "properties": { @@ -4470,6 +4471,12 @@ "ChatCompletionResponseStreamChunk": { "type": "object", "properties": { + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } + }, "event": { "$ref": "#/components/schemas/ChatCompletionResponseEvent", "description": "The event containing the new content" diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 19c646bf98..b30025020b 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1641,7 +1641,6 @@ components: Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model. - default: auto tool_prompt_format: type: string enum: @@ -1713,6 +1712,10 @@ components: BenchmarkConfig: type: object properties: + type: + type: string + const: benchmark + default: benchmark eval_candidate: $ref: '#/components/schemas/EvalCandidate' scoring_params: @@ -1723,6 +1726,7 @@ components: type: integer additionalProperties: false required: + - type - eval_candidate - scoring_params EvalCandidate: @@ -2960,185 +2964,6 @@ components: - delta description: >- A chunk of a streamed completion response. - AgentConfig: - type: object - properties: - sampling_params: - $ref: '#/components/schemas/SamplingParams' - input_shields: - type: array - items: - type: string - output_shields: - type: array - items: - type: string - toolgroups: - type: array - items: - $ref: '#/components/schemas/AgentTool' - client_tools: - type: array - items: - $ref: '#/components/schemas/ToolDef' - tool_choice: - type: string - enum: - - auto - - required - description: >- - Whether tool use is required or automatic. This is a hint to the model - which may not be followed. It depends on the Instruction Following capabilities - of the model. - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - Prompt format for calling custom / zero shot tools. 
- tool_config: - $ref: '#/components/schemas/ToolConfig' - max_infer_iters: - type: integer - default: 10 - model: - type: string - instructions: - type: string - enable_session_persistence: - type: boolean - response_format: - $ref: '#/components/schemas/ResponseFormat' - additionalProperties: false - required: - - model - - instructions - - enable_session_persistence - AgentTool: - oneOf: - - type: string - - type: object - properties: - name: - type: string - args: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - name - - args - ToolDef: - type: object - properties: - type: - type: string - const: text - default: text - text: - type: string - additionalProperties: false - required: - - type - - text - ToolCallDelta: - type: object - properties: - type: - type: string - const: tool_call - default: tool_call - tool_call: - oneOf: - - type: string - - $ref: '#/components/schemas/ToolCall' - parse_status: - type: string - enum: - - started - - in_progress - - failed - - succeeded - additionalProperties: false - required: - - type - - tool_call - - parse_status - CompletionRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. The model must be registered with - Llama Stack and available via the /models endpoint. - content: - $ref: '#/components/schemas/InterleavedContent' - description: The content to generate a completion for - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: >- - (Optional) Parameters to control the sampling strategy - response_format: - $ref: '#/components/schemas/ResponseFormat' - description: >- - (Optional) Grammar specification for guided (structured) decoding - stream: - type: boolean - description: >- - (Optional) If True, generate an SSE event stream of the response. Defaults - to False. - logprobs: - type: object - properties: - top_k: - type: integer - default: 0 - description: >- - How many tokens (for each position) to return log probabilities for. - additionalProperties: false - description: >- - (Optional) If specified, log probabilities for each token position will - be returned. - additionalProperties: false - required: - - model_id - - content - CompletionResponseStreamChunk: - type: object - properties: - delta: - type: string - description: >- - New content generated since last chunk. This can be one or more tokens. - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Optional reason why generation stopped, if complete - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - description: >- - Optional log probabilities for generated tokens - additionalProperties: false - required: - - delta - description: >- - A chunk of a streamed completion response. CreateAgentRequest: type: object properties: From bd94769c7dde5048cc98735b449339571adba79e Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 13 Feb 2025 01:03:28 -0500 Subject: [PATCH 18/31] feat: support listing all for `llama stack list-providers` (#1056) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Support listing all for `llama stack list-providers`. For ease of reading, sort the output rows by type. Before the change. 
```  llama stack list-providers usage: llama stack list-providers [-h] {inference,safety,agents,vector_io,datasetio,scoring,eval,post_training,tool_runtime,telemetry} llama stack list-providers: error: the following arguments are required: api ``` After the change. ``` +---------------+----------------------------------+----------------------------------------------------------------------------------+ | API Type | Provider Type | PIP Package Dependencies | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | agents | inline::meta-reference | matplotlib,pillow,pandas,scikit-learn,aiosqlite,psycopg2-binary,redis | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | datasetio | inline::localfs | pandas | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | datasetio | remote::huggingface | datasets | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | eval | inline::meta-reference | | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | inline::meta-reference | accelerate,blobfile,fairscale,torch,torchvision,transformers,zmq,lm-format- | | | | enforcer,sentence-transformers | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | inline::meta-reference-quantized | accelerate,blobfile,fairscale,torch,torchvision,transformers,zmq,lm-format- | | | | enforcer,sentence-transformers,fbgemm-gpu,torchao==0.5.0 | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | inline::sentence-transformers | sentence-transformers | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | inline::vllm | vllm | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::bedrock | boto3 | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::cerebras | cerebras_cloud_sdk | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::databricks | openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::fireworks | fireworks-ai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::groq | groq | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::hf::endpoint | huggingface_hub,aiohttp | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::hf::serverless | huggingface_hub,aiohttp | 
+---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::nvidia | openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::ollama | ollama,aiohttp | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::runpod | openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::sambanova | openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::tgi | huggingface_hub,aiohttp | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::together | together | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | inference | remote::vllm | openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | post_training | inline::torchtune | torch,torchtune==0.5.0,torchao==0.8.0,numpy | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | safety | inline::code-scanner | codeshield | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | safety | inline::llama-guard | | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | safety | inline::meta-reference | transformers,torch --index-url https://download.pytorch.org/whl/cpu | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | safety | inline::prompt-guard | transformers,torch --index-url https://download.pytorch.org/whl/cpu | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | safety | remote::bedrock | boto3 | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | scoring | inline::basic | | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | scoring | inline::braintrust | autoevals,openai | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | scoring | inline::llm-as-judge | | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | telemetry | inline::meta-reference | opentelemetry-sdk,opentelemetry-exporter-otlp-proto-http | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | inline::code-interpreter | | 
+---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | inline::rag-runtime | | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | remote::bing-search | requests | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | remote::brave-search | requests | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | remote::model-context-protocol | mcp | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | remote::tavily-search | requests | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | tool_runtime | remote::wolfram-alpha | requests | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | inline::chromadb | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,chromadb | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | inline::faiss | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,faiss-cpu | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | inline::meta-reference | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,faiss-cpu | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | remote::chromadb | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,chromadb- | | | | client | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | remote::pgvector | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no- | | | | deps,psycopg2-binary | +---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | remote::qdrant | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,qdrant- | | | | client | 
+---------------+----------------------------------+----------------------------------------------------------------------------------+ | vector_io | remote::weaviate | blobfile,chardet,pypdf,tqdm,numpy,scikit- | | | | learn,scipy,nltk,sentencepiece,transformers,torch torchvision --index-url | | | | https://download.pytorch.org/whl/cpu,sentence-transformers --no-deps,weaviate- | | | | client | +---------------+----------------------------------+----------------------------------------------------------------------------------+ ``` [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Manually. [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- llama_stack/cli/stack/list_providers.py | 26 +++++++++++++++++++------ llama_stack/cli/table.py | 7 ++++++- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/llama_stack/cli/stack/list_providers.py b/llama_stack/cli/stack/list_providers.py index bd152c9800..bfe11aa2c7 100644 --- a/llama_stack/cli/stack/list_providers.py +++ b/llama_stack/cli/stack/list_providers.py @@ -21,15 +21,19 @@ def __init__(self, subparsers: argparse._SubParsersAction): self._add_arguments() self.parser.set_defaults(func=self._run_providers_list_cmd) - def _add_arguments(self): + @property + def providable_apis(self): from llama_stack.distribution.distribution import providable_apis - api_values = [api.value for api in providable_apis()] + return [api.value for api in providable_apis()] + + def _add_arguments(self): self.parser.add_argument( "api", type=str, - choices=api_values, - help="API to list providers for (one of: {})".format(api_values), + choices=self.providable_apis, + nargs="?", + help="API to list providers for. List all if not specified.", ) def _run_providers_list_cmd(self, args: argparse.Namespace) -> None: @@ -37,20 +41,29 @@ def _run_providers_list_cmd(self, args: argparse.Namespace) -> None: from llama_stack.distribution.distribution import Api, get_provider_registry all_providers = get_provider_registry() - providers_for_api = all_providers[Api(args.api)] + if args.api: + providers = [(args.api, all_providers[Api(args.api)])] + else: + providers = [(k.value, prov) for k, prov in all_providers.items()] + + providers = [p for api, p in providers if api in self.providable_apis] # eventually, this should query a registry at llama.meta.com/llamastack/distributions headers = [ + "API Type", "Provider Type", "PIP Package Dependencies", ] rows = [] - for spec in providers_for_api.values(): + + specs = [spec for p in providers for spec in p.values()] + for spec in specs: if spec.is_sample: continue rows.append( [ + spec.api.value, spec.provider_type, ",".join(spec.pip_packages), ] @@ -59,4 +72,5 @@ def _run_providers_list_cmd(self, args: argparse.Namespace) -> None: rows, headers, separate_rows=True, + sort_by=(0, 1), ) diff --git a/llama_stack/cli/table.py b/llama_stack/cli/table.py index 50f54852bc..847719f817 100644 --- a/llama_stack/cli/table.py +++ b/llama_stack/cli/table.py @@ -6,6 +6,7 @@ import re import textwrap +from typing import Iterable from termcolor import cprint @@ -39,11 +40,15 @@ def wrap(text, width): return "\n".join(lines) -def print_table(rows, headers=None, separate_rows: bool = False): +def print_table(rows, headers=None, separate_rows: bool = False, sort_by: Iterable[int] = tuple()): def itemlen(item): return max([len(line) for line in strip_ansi_colors(item).split("\n")]) rows = [[x or "" for x in row] for row in rows] + + if sort_by: + rows.sort(key=lambda 
x: tuple(x[i] for i in sort_by))
+
     if not headers:
         col_widths = [max(itemlen(item) for item in col) for col in zip(*rows)]
     else:

From a5d21e6f23d1fa86fff753f752fbb90775142f8f Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Thu, 13 Feb 2025 10:57:30 -0500
Subject: [PATCH 19/31] docs: Mention conventional commits format in
 CONTRIBUTING.md (#1075)

# What does this PR do?

This adds a note to ensure pull requests follow the conventional commits
format, along with a link to that format, in CONTRIBUTING.md. One of the
pull-request checks enforces PR titles that match this format, so it's
good to be upfront about this expectation before a new developer opens a
PR.

Signed-off-by: Ben Browning
---
 CONTRIBUTING.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8028c194e5..6dc08b5c0a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -40,6 +40,7 @@ If you need help or guidance, comment on the issue. Issues that are extra friend
 3. Ensure the test suite passes.
 4. Make sure your code lints using `pre-commit`.
 5. If you haven't already, complete the Contributor License Agreement ("CLA").
+6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
 
 ## Contributor License Agreement ("CLA")
 In order to accept your pull request, we need you to submit a CLA. You only need

From 06c732a008ae877c688b69532bfcead0e3b68a4c Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Thu, 13 Feb 2025 11:00:00 -0500
Subject: [PATCH 20/31] fix: logprobs support in remote-vllm provider (#1074)

# What does this PR do?

The remote-vllm provider was not passing logprobs options from
CompletionRequest or ChatCompletionRequests through to the OpenAI client
parameters. I manually verified this, as well as observed this provider
failing `TestInference::test_completion_logprobs`. This was filed as
issue #1073.

This fixes that by passing the `logprobs.top_k` value through to the
parameters we pass into the OpenAI client. Additionally, this fixes a
bug in `test_text_inference.py` where it mistakenly assumed chunk.delta
was of type `ContentDelta` for completion requests. The deltas are of
type `ContentDelta` for chat completion requests, but for basic
completion requests the deltas are of type string.

This test was likely failing for other providers that did properly
support logprobs because of this latter issue in the test, which was
hit while fixing the above issue with the remote-vllm provider.

(Closes #1073)

## Test Plan

First, you need a vllm running. I ran one locally like this:
```
vllm serve meta-llama/Llama-3.2-3B-Instruct --port 8001 --enable-auto-tool-choice --tool-call-parser llama3_json
```

Next, run test_text_inference.py against this vllm using the remote vllm
provider like this:
```
VLLM_URL="http://localhost:8001/v1" python -m pytest -s -v llama_stack/providers/tests/inference/test_text_inference.py --providers "inference=vllm_remote"
```

Before my change, the test failed with this error:
```
llama_stack/providers/tests/inference/test_text_inference.py:155: in test_completion_logprobs
    assert 1 <= len(response.logprobs) <= 5
E   TypeError: object of type 'NoneType' has no len()
```

After my change, the test passes.
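
As an extra, informal check (a sketch, not part of the test suite: it
assumes the same vLLM server as above and talks to it with the OpenAI
client directly, mirroring what the fixed `_get_params` now sends for
`logprobs.top_k`; the `EMPTY` api key is a placeholder):
```
# Hypothetical spot-check against the vLLM server started above. The
# `logprobs=3` argument mirrors how CompletionRequest.logprobs.top_k is
# now forwarded to the OpenAI client by the remote-vllm provider.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8001/v1", api_key="EMPTY")
resp = client.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    prompt="The capital of France is",
    max_tokens=5,
    logprobs=3,
)
# Each generated token should come back with its top-3 log probabilities.
print(resp.choices[0].logprobs.top_logprobs)
```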
[//]: # (## Documentation) Signed-off-by: Ben Browning --- llama_stack/providers/remote/inference/vllm/vllm.py | 3 +++ llama_stack/providers/tests/inference/test_text_inference.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 02594891be..3574768b51 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -345,6 +345,9 @@ async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequ else: raise ValueError(f"Unknown response format {fmt.type}") + if request.logprobs and request.logprobs.top_k: + input_dict["logprobs"] = request.logprobs.top_k + return { "model": request.model, **input_dict, diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py index 99f968cbc2..6a72591238 100644 --- a/llama_stack/providers/tests/inference/test_text_inference.py +++ b/llama_stack/providers/tests/inference/test_text_inference.py @@ -175,7 +175,7 @@ async def test_completion_logprobs(self, inference_model, inference_stack): 1 <= len(chunks) <= 6 ) # why 6 and not 5? the response may have an extra closing chunk, e.g. for usage or stop_reason for chunk in chunks: - if chunk.delta.type == "text" and chunk.delta.text: # if there's a token, we expect logprobs + if chunk.delta: # if there's a token, we expect logprobs assert chunk.logprobs, "Logprobs should not be empty" assert all(len(logprob.logprobs_by_token) == 3 for logprob in chunk.logprobs) else: # no token, no logprobs From 40468aaa14f2a685db9d80d7ef0cd4afd5e7c832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 13 Feb 2025 17:07:59 +0100 Subject: [PATCH 21/31] fix: improve signal handling and update dependencies (#1044) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? This commit enhances the signal handling mechanism in the server by improving the `handle_signal` (previously handle_sigint) function. It now properly retrieves the signal name, ensuring clearer logging when a termination signal is received. Additionally, it cancels all running tasks and waits for their completion before stopping the event loop, allowing for a more graceful shutdown. Support for handling SIGTERM has also been added alongside SIGINT. Before the changes, handle_sigint used asyncio.run(run_shutdown()). However, asyncio.run() is meant to start a new event loop, and calling it inside an existing one (like when running Uvicorn) raises an error. The fix replaces asyncio.run(run_shutdown()) with an async function scheduled on the existing loop using loop.create_task(shutdown()). This ensures that the shutdown coroutine runs within the current event loop instead of trying to create a new one. Furthermore, this commit updates the project dependencies. `fastapi` and `uvicorn` have been added to the development dependencies in `pyproject.toml` and `uv.lock`, ensuring that the necessary packages are available for development and execution. 
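
To make the event-loop pitfall described above concrete, here is a
minimal, standalone sketch of the pattern (illustrative only, not the
server code itself; a timer stands in for a real signal being delivered):
```
import asyncio

async def shutdown():
    # Real cleanup would await each implementation's shutdown() here.
    print("cleaning up, then stopping the loop")
    asyncio.get_running_loop().stop()

def handler():
    # asyncio.run(shutdown()) here would raise "RuntimeError: asyncio.run()
    # cannot be called from a running event loop", so instead schedule the
    # coroutine on the loop that is already running.
    asyncio.get_running_loop().create_task(shutdown())

loop = asyncio.new_event_loop()
loop.call_later(0.1, handler)  # stand-in for SIGINT/SIGTERM arriving
loop.run_forever()
loop.close()
```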
Closes: https://github.com/meta-llama/llama-stack/issues/1043 Signed-off-by: Sébastien Han [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Run a server and send SIGINT: ``` INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" python -m llama_stack.distribution.server.server --yaml-config ./llama_stack/templates/ollama/run.yaml Using config file: llama_stack/templates/ollama/run.yaml Run configuration: apis: - agents - datasetio - eval - inference - safety - scoring - telemetry - tool_runtime - vector_io container_image: null datasets: [] eval_tasks: [] image_name: ollama metadata_store: db_path: /Users/leseb/.llama/distributions/ollama/registry.db namespace: null type: sqlite models: - metadata: {} model_id: meta-llama/Llama-3.2-3B-Instruct model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType - llm provider_id: ollama provider_model_id: null - metadata: embedding_dimension: 384 model_id: all-MiniLM-L6-v2 model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType - embedding provider_id: sentence-transformers provider_model_id: null providers: agents: - config: persistence_store: db_path: /Users/leseb/.llama/distributions/ollama/agents_store.db namespace: null type: sqlite provider_id: meta-reference provider_type: inline::meta-reference datasetio: - config: {} provider_id: huggingface provider_type: remote::huggingface - config: {} provider_id: localfs provider_type: inline::localfs eval: - config: {} provider_id: meta-reference provider_type: inline::meta-reference inference: - config: url: http://localhost:11434 provider_id: ollama provider_type: remote::ollama - config: {} provider_id: sentence-transformers provider_type: inline::sentence-transformers safety: - config: {} provider_id: llama-guard provider_type: inline::llama-guard scoring: - config: {} provider_id: basic provider_type: inline::basic - config: {} provider_id: llm-as-judge provider_type: inline::llm-as-judge - config: openai_api_key: '********' provider_id: braintrust provider_type: inline::braintrust telemetry: - config: service_name: llama-stack sinks: console,sqlite sqlite_db_path: /Users/leseb/.llama/distributions/ollama/trace_store.db provider_id: meta-reference provider_type: inline::meta-reference tool_runtime: - config: api_key: '********' max_results: 3 provider_id: brave-search provider_type: remote::brave-search - config: api_key: '********' max_results: 3 provider_id: tavily-search provider_type: remote::tavily-search - config: {} provider_id: code-interpreter provider_type: inline::code-interpreter - config: {} provider_id: rag-runtime provider_type: inline::rag-runtime vector_io: - config: kvstore: db_path: /Users/leseb/.llama/distributions/ollama/faiss_store.db namespace: null type: sqlite provider_id: faiss provider_type: inline::faiss scoring_fns: [] server: port: 8321 tls_certfile: null tls_keyfile: null shields: [] tool_groups: - args: null mcp_endpoint: null provider_id: tavily-search toolgroup_id: builtin::websearch - args: null mcp_endpoint: null provider_id: rag-runtime toolgroup_id: builtin::rag - args: null mcp_endpoint: null provider_id: code-interpreter toolgroup_id: builtin::code_interpreter vector_dbs: [] version: '2' INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:213: Resolved 31 providers INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-inference => ollama INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-inference => 
sentence-transformers INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: models => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inference => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-vector_io => faiss INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-safety => llama-guard INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: shields => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: safety => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: vector_dbs => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: vector_io => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-tool_runtime => brave-search INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-tool_runtime => tavily-search INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-tool_runtime => code-interpreter INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-tool_runtime => rag-runtime INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: tool_groups => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: tool_runtime => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: agents => meta-reference INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-datasetio => huggingface INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-datasetio => localfs INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: datasets => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: datasetio => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: telemetry => meta-reference INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-scoring => basic INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-scoring => llm-as-judge INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-scoring => braintrust INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: scoring_functions => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: scoring => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inner-eval => meta-reference INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: eval_tasks => __routing_table__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: eval => __autorouted__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:215: inspect => __builtin__ INFO 2025-02-12 10:21:03,540 llama_stack.distribution.resolver:216: INFO 2025-02-12 10:21:03,723 llama_stack.providers.remote.inference.ollama.ollama:148: checking connectivity to Ollama at `http://localhost:11434`... INFO 2025-02-12 10:21:03,734 httpx:1740: HTTP Request: GET http://localhost:11434/api/ps "HTTP/1.1 200 OK" INFO 2025-02-12 10:21:03,843 faiss.loader:148: Loading faiss. INFO 2025-02-12 10:21:03,865 faiss.loader:150: Successfully loaded faiss. INFO 2025-02-12 10:21:03,868 faiss:173: Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. Warning: `bwrap` is not available. Code interpreter tool will not work correctly. 
INFO 2025-02-12 10:21:04,315 datasets:54: PyTorch version 2.6.0 available. INFO 2025-02-12 10:21:04,556 httpx:1740: HTTP Request: GET http://localhost:11434/api/ps "HTTP/1.1 200 OK" INFO 2025-02-12 10:21:04,557 llama_stack.providers.utils.inference.embedding_mixin:42: Loading sentence transformer for all-MiniLM-L6-v2... INFO 2025-02-12 10:21:07,202 sentence_transformers.SentenceTransformer:210: Use pytorch device_name: mps INFO 2025-02-12 10:21:07,202 sentence_transformers.SentenceTransformer:218: Load pretrained SentenceTransformer: all-MiniLM-L6-v2 INFO 2025-02-12 10:21:09,500 llama_stack.distribution.stack:102: Models: all-MiniLM-L6-v2 served by sentence-transformers INFO 2025-02-12 10:21:09,500 llama_stack.distribution.stack:102: Models: meta-llama/Llama-3.2-3B-Instruct served by ollama INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: basic::equality served by basic INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: basic::regex_parser_multiple_choice_answer served by basic INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: basic::subset_of served by basic INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::answer-correctness served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::answer-relevancy served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::answer-similarity served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-entity-recall served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-precision served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-recall served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::context-relevancy served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::factuality served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: braintrust::faithfulness served by braintrust INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: llm-as-judge::405b-simpleqa served by llm-as-judge INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Scoring_fns: llm-as-judge::base served by llm-as-judge INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Tool_groups: builtin::code_interpreter served by code-interpreter INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Tool_groups: builtin::rag served by rag-runtime INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:102: Tool_groups: builtin::websearch served by tavily-search INFO 2025-02-12 10:21:09,501 llama_stack.distribution.stack:106: Serving API eval POST /v1/eval/tasks/{task_id}/evaluations DELETE /v1/eval/tasks/{task_id}/jobs/{job_id} GET /v1/eval/tasks/{task_id}/jobs/{job_id}/result GET /v1/eval/tasks/{task_id}/jobs/{job_id} POST /v1/eval/tasks/{task_id}/jobs Serving API agents POST /v1/agents POST /v1/agents/{agent_id}/session POST /v1/agents/{agent_id}/session/{session_id}/turn DELETE /v1/agents/{agent_id} DELETE /v1/agents/{agent_id}/session/{session_id} GET /v1/agents/{agent_id}/session/{session_id} GET /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id} GET 
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id} Serving API scoring_functions GET /v1/scoring-functions/{scoring_fn_id} GET /v1/scoring-functions POST /v1/scoring-functions Serving API safety POST /v1/safety/run-shield Serving API inspect GET /v1/health GET /v1/inspect/providers GET /v1/inspect/routes GET /v1/version Serving API tool_runtime POST /v1/tool-runtime/invoke GET /v1/tool-runtime/list-tools POST /v1/tool-runtime/rag-tool/insert POST /v1/tool-runtime/rag-tool/query Serving API datasetio POST /v1/datasetio/rows GET /v1/datasetio/rows Serving API shields GET /v1/shields/{identifier} GET /v1/shields POST /v1/shields Serving API eval_tasks GET /v1/eval-tasks/{eval_task_id} GET /v1/eval-tasks POST /v1/eval-tasks Serving API models GET /v1/models/{model_id} GET /v1/models POST /v1/models DELETE /v1/models/{model_id} Serving API datasets GET /v1/datasets/{dataset_id} GET /v1/datasets POST /v1/datasets DELETE /v1/datasets/{dataset_id} Serving API vector_io POST /v1/vector-io/insert POST /v1/vector-io/query Serving API inference POST /v1/inference/chat-completion POST /v1/inference/completion POST /v1/inference/embeddings Serving API tool_groups GET /v1/tools/{tool_name} GET /v1/toolgroups/{toolgroup_id} GET /v1/toolgroups GET /v1/tools POST /v1/toolgroups DELETE /v1/toolgroups/{toolgroup_id} Serving API vector_dbs GET /v1/vector-dbs/{vector_db_id} GET /v1/vector-dbs POST /v1/vector-dbs DELETE /v1/vector-dbs/{vector_db_id} Serving API scoring POST /v1/scoring/score POST /v1/scoring/score-batch Serving API telemetry GET /v1/telemetry/traces/{trace_id}/spans/{span_id} GET /v1/telemetry/spans/{span_id}/tree GET /v1/telemetry/traces/{trace_id} POST /v1/telemetry/events GET /v1/telemetry/spans GET /v1/telemetry/traces POST /v1/telemetry/spans/export Listening on ['::', '0.0.0.0']:5001 INFO: Started server process [65372] INFO: Waiting for application startup. INFO: ASGI 'lifespan' protocol appears unsupported. INFO: Application startup complete. INFO: Uvicorn running on http://['::', '0.0.0.0']:5001 (Press CTRL+C to quit) ^CINFO: Shutting down INFO: Finished server process [65372] Received signal SIGINT (2). Exiting gracefully... 
INFO 2025-02-12 10:21:11,215 __main__:151: Shutting down ModelsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down InferenceRouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ShieldsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down SafetyRouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down VectorDBsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down VectorIORouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ToolGroupsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ToolRuntimeRouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down MetaReferenceAgentsImpl INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down DatasetsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down DatasetIORouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down TelemetryAdapter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ScoringFunctionsRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down ScoringRouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down EvalTasksRoutingTable INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down EvalRouter INFO 2025-02-12 10:21:11,216 __main__:151: Shutting down DistributionInspectImpl ``` [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) Signed-off-by: Sébastien Han --- llama_stack/distribution/inspect.py | 3 + .../distribution/routers/routing_tables.py | 3 + llama_stack/distribution/server/server.py | 77 ++++++++++++++++--- .../inline/agents/meta_reference/agents.py | 3 + pyproject.toml | 2 + uv.lock | 18 +++++ 6 files changed, 94 insertions(+), 12 deletions(-) diff --git a/llama_stack/distribution/inspect.py b/llama_stack/distribution/inspect.py index b7ee4a219f..fddb625701 100644 --- a/llama_stack/distribution/inspect.py +++ b/llama_stack/distribution/inspect.py @@ -82,3 +82,6 @@ async def health(self) -> HealthInfo: async def version(self) -> VersionInfo: return VersionInfo(version=version("llama-stack")) + + async def shutdown(self) -> None: + pass diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 99c73986ce..ec258af491 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -570,3 +570,6 @@ async def unregister_toolgroup(self, toolgroup_id: str) -> None: for tool in tools: await self.unregister_object(tool) await self.unregister_object(tool_group) + + async def shutdown(self) -> None: + pass diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index d2c32de119..bb735268b6 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -7,6 +7,7 @@ import argparse import asyncio import functools +import logging import inspect import json import os @@ -52,6 +53,9 @@ REPO_ROOT = Path(__file__).parent.parent.parent.parent +logging.basicConfig(level=logging.INFO, format="%(levelname)s %(asctime)s %(name)s:%(lineno)d: %(message)s") +logger = logging.getLogger(__name__) + def warn_with_traceback(message, category, filename, lineno, file=None, line=None): log = file if hasattr(file, "write") else sys.stderr @@ -112,21 +116,69 @@ def translate_exception(exc: Exception) -> Union[HTTPException, RequestValidatio ) -def handle_sigint(app, *args, **kwargs): - print("SIGINT or CTRL-C detected. 
Exiting gracefully...")
+def handle_signal(app, signum, _) -> None:
+    """
+    Handle incoming signals and initiate a graceful shutdown of the application.
+
+    This function is intended to be used as a signal handler for various signals
+    (e.g., SIGINT, SIGTERM). Upon receiving a signal, it will print a message
+    indicating the received signal and initiate a shutdown process.
 
-    async def run_shutdown():
-        for impl in app.__llama_stack_impls__.values():
-            print(f"Shutting down {impl}")
-            await impl.shutdown()
+    Args:
+        app: The application instance containing implementations to be shut down.
+        signum (int): The signal number received.
+        frame: The current stack frame (not used in this function).
 
-    asyncio.run(run_shutdown())
+    The shutdown process involves:
+    - Shutting down all implementations registered in the application.
+    - Gathering all running asyncio tasks.
+    - Cancelling all gathered tasks.
+    - Waiting for all tasks to finish.
+    - Stopping the event loop.
 
-    loop = asyncio.get_event_loop()
-    for task in asyncio.all_tasks(loop):
-        task.cancel()
+    Note:
+        This function schedules the shutdown process as an asyncio task and does
+        not block the current execution.
+    """
+    signame = signal.Signals(signum).name
+    print(f"Received signal {signame} ({signum}). Exiting gracefully...")
+
+    async def shutdown():
+        try:
+            # Gracefully shut down implementations
+            for impl in app.__llama_stack_impls__.values():
+                impl_name = impl.__class__.__name__
+                logger.info("Shutting down %s", impl_name)
+                try:
+                    if hasattr(impl, "shutdown"):
+                        await asyncio.wait_for(impl.shutdown(), timeout=5)
+                    else:
+                        logger.warning("No shutdown method for %s", impl_name)
+                except asyncio.TimeoutError:
+                    logger.exception("Shutdown timeout for %s", impl_name)
+                except Exception as e:
+                    logger.exception("Failed to shutdown %s: %s", impl_name, e)
+
+            # Gather all running tasks
+            loop = asyncio.get_running_loop()
+            tasks = [task for task in asyncio.all_tasks(loop) if task is not asyncio.current_task()]
+
+            # Cancel all tasks
+            for task in tasks:
+                task.cancel()
+
+            # Wait for all tasks to finish
+            try:
+                await asyncio.wait_for(asyncio.gather(*tasks, return_exceptions=True), timeout=10)
+            except asyncio.TimeoutError:
+                logger.exception("Timeout while waiting for tasks to finish")
+            except asyncio.CancelledError:
+                pass
+            finally:
+                loop.stop()
 
-    loop.stop()
+    loop = asyncio.get_running_loop()
+    loop.create_task(shutdown())
 
 
 @asynccontextmanager
@@ -386,7 +438,8 @@ def main():
         print("")
     app.exception_handler(RequestValidationError)(global_exception_handler)
     app.exception_handler(Exception)(global_exception_handler)
-    signal.signal(signal.SIGINT, functools.partial(handle_sigint, app))
+    signal.signal(signal.SIGINT, functools.partial(handle_signal, app))
+    signal.signal(signal.SIGTERM, functools.partial(handle_signal, app))
 
     app.__llama_stack_impls__ = impls
 
diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py
index fe4ccd1a33..e3c18d1122 100644
--- a/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -212,3 +212,6 @@ async def delete_agents_session(self, agent_id: str, session_id: str) -> None:
 
     async def delete_agent(self, agent_id: str) -> None:
         await self.persistence_store.delete(f"agent:{agent_id}")
+
+    async def shutdown(self) -> None:
+        pass
diff --git a/pyproject.toml b/pyproject.toml
index 5e9cb75e25..2f40ceac9f 100644
--- a/pyproject.toml
+++ 
b/pyproject.toml @@ -46,6 +46,8 @@ dev = [ "types-requests", "types-setuptools", "pre-commit", + "uvicorn", + "fastapi", ] docs = [ "sphinx-autobuild", diff --git a/uv.lock b/uv.lock index 087396eeac..97ae521244 100644 --- a/uv.lock +++ b/uv.lock @@ -431,6 +431,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702 }, ] +[[package]] +name = "fastapi" +version = "0.115.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "starlette" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a2/b2/5a5dc4affdb6661dea100324e19a7721d5dc524b464fe8e366c093fd7d87/fastapi-0.115.8.tar.gz", hash = "sha256:0ce9111231720190473e222cdf0f07f7206ad7e53ea02beb1d2dc36e2f0741e9", size = 295403 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/7d/2d6ce181d7a5f51dedb8c06206cbf0ec026a99bf145edd309f9e17c3282f/fastapi-0.115.8-py3-none-any.whl", hash = "sha256:753a96dd7e036b34eeef8babdfcfe3f28ff79648f86551eb36bfc1b0bf4a8cbf", size = 94814 }, +] + [[package]] name = "fastjsonschema" version = "2.21.1" @@ -724,6 +738,7 @@ dependencies = [ [package.optional-dependencies] dev = [ { name = "black" }, + { name = "fastapi" }, { name = "nbval" }, { name = "pre-commit" }, { name = "pytest" }, @@ -731,6 +746,7 @@ dev = [ { name = "ruff" }, { name = "types-requests" }, { name = "types-setuptools" }, + { name = "uvicorn" }, ] docs = [ { name = "myst-parser" }, @@ -748,6 +764,7 @@ docs = [ requires-dist = [ { name = "black", marker = "extra == 'dev'" }, { name = "blobfile" }, + { name = "fastapi", marker = "extra == 'dev'" }, { name = "fire" }, { name = "httpx" }, { name = "huggingface-hub" }, @@ -776,6 +793,7 @@ requires-dist = [ { name = "termcolor" }, { name = "types-requests", marker = "extra == 'dev'" }, { name = "types-setuptools", marker = "extra == 'dev'" }, + { name = "uvicorn", marker = "extra == 'dev'" }, ] [[package]] From b56465738dc8433b9131bf08956b23381aa83e3d Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Fri, 14 Feb 2025 00:33:11 +0800 Subject: [PATCH 22/31] style: update model id in model list title (#1072) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] Since the subcommands used `MODEL_ID`, it would be better to use it in `model list` and make it easy to find it. 
``` $ llama model verify-download --help usage: llama model verify-download [-h] --model-id MODEL_ID << $ llama model describe --help usage: llama model describe [-h] -m MODEL_ID << $ llama download --help --model-id MODEL_ID See `llama model list` or `llama model list --show-all` for the list of available models before: $ llama model list +-----------------------------------------+-----------------------------------------------------+----------------+ | Model Descriptor | Hugging Face Repo | Context Length | +-----------------------------------------+-----------------------------------------------------+----------------+ after: $ llama model list +-----------------------------------------+-----------------------------------------------------+----------------+ | Model Descriptor | Model ID | Context Length | +-----------------------------------------+-----------------------------------------------------+----------------+ | Llama3.1-8B | meta-llama/Llama-3.1-8B | 128K | +-----------------------------------------+-----------------------------------------------------+----------------+ ``` [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: reidliu Co-authored-by: reidliu --- llama_stack/cli/model/list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/cli/model/list.py b/llama_stack/cli/model/list.py index 6d296e75e1..9b5ebb1a5f 100644 --- a/llama_stack/cli/model/list.py +++ b/llama_stack/cli/model/list.py @@ -38,7 +38,7 @@ def _run_model_list_cmd(self, args: argparse.Namespace) -> None: headers = [ "Model Descriptor", - "Hugging Face Repo", + "Model ID", "Context Length", ] From b8a612e5bd83e9e367cef7e5b1f95849d0984f3e Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Feb 2025 09:50:38 -0800 Subject: [PATCH 23/31] update --- docs/openapi_generator/pyopenapi/generator.py | 2 +- .../openapi_generator/pyopenapi/operations.py | 2 ++ llama_stack/apis/agents/agents.py | 19 +++++++++++-------- llama_stack/apis/datasets/datasets.py | 4 ++-- llama_stack/apis/models/models.py | 4 ++-- .../scoring_functions/scoring_functions.py | 2 +- llama_stack/apis/shields/shields.py | 2 +- llama_stack/apis/telemetry/telemetry.py | 8 ++++---- llama_stack/apis/tools/tools.py | 6 +++--- llama_stack/apis/vector_dbs/vector_dbs.py | 4 ++-- 10 files changed, 29 insertions(+), 24 deletions(-) diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index 86db8a06d9..e37c45690c 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -647,7 +647,6 @@ def _build_operation(self, op: EndpointOperation) -> Operation: description = "\n".join( filter(None, [doc_string.short_description, doc_string.long_description]) ) - return Operation( tags=[op.defining_class.__name__], summary=None, @@ -685,6 +684,7 @@ def generate(self) -> Document: raise NotImplementedError(f"unknown HTTP method: {op.http_method}") route = op.get_route() + route = route.replace(":path", "") print(f"route: {route}") if route in paths: paths[route].update(pathItem) diff --git a/docs/openapi_generator/pyopenapi/operations.py b/docs/openapi_generator/pyopenapi/operations.py index abeb169366..bf4d35c87f 100644 --- a/docs/openapi_generator/pyopenapi/operations.py +++ 
b/docs/openapi_generator/pyopenapi/operations.py @@ -130,6 +130,8 @@ def __getitem__(self, key: str) -> None: def _get_route_parameters(route: str) -> List[str]: extractor = _FormatParameterExtractor() + # Replace all occurrences of ":path" with empty string + route = route.replace(":path", "") route.format_map(extractor) return extractor.keys diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index 785248633f..b20145be96 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -29,11 +29,11 @@ SamplingParams, ToolCall, ToolChoice, + ToolConfig, ToolPromptFormat, ToolResponse, ToolResponseMessage, UserMessage, - ToolConfig, ) from llama_stack.apis.safety import SafetyViolation from llama_stack.apis.tools import ToolDef @@ -318,7 +318,7 @@ async def create_agent( agent_config: AgentConfig, ) -> AgentCreateResponse: ... - @webmethod(route="/agents/{agent_id}/session/{session_id}/turn", method="POST") + @webmethod(route="/agents/{agent_id:path}/session/{session_id:path}/turn", method="POST") async def create_agent_turn( self, agent_id: str, @@ -335,7 +335,10 @@ async def create_agent_turn( tool_config: Optional[ToolConfig] = None, ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: ... - @webmethod(route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}", method="GET") + @webmethod( + route="/agents/{agent_id:path}/session/{session_id:path}/turn/{turn_id:path}", + method="GET", + ) async def get_agents_turn( self, agent_id: str, @@ -344,7 +347,7 @@ async def get_agents_turn( ) -> Turn: ... @webmethod( - route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}", + route="/agents/{agent_id:path}/session/{session_id:path}/turn/{turn_id:path}/step/{step_id:path}", method="GET", ) async def get_agents_step( @@ -355,14 +358,14 @@ async def get_agents_step( step_id: str, ) -> AgentStepResponse: ... - @webmethod(route="/agents/{agent_id}/session", method="POST") + @webmethod(route="/agents/{agent_id:path}/session", method="POST") async def create_agent_session( self, agent_id: str, session_name: str, ) -> AgentSessionCreateResponse: ... - @webmethod(route="/agents/{agent_id}/session/{session_id}", method="GET") + @webmethod(route="/agents/{agent_id:path}/session/{session_id:path}", method="GET") async def get_agents_session( self, session_id: str, @@ -370,14 +373,14 @@ async def get_agents_session( turn_ids: Optional[List[str]] = None, ) -> Session: ... - @webmethod(route="/agents/{agent_id}/session/{session_id}", method="DELETE") + @webmethod(route="/agents/{agent_id:path}/session/{session_id:path}", method="DELETE") async def delete_agents_session( self, session_id: str, agent_id: str, ) -> None: ... - @webmethod(route="/agents/{agent_id}", method="DELETE") + @webmethod(route="/agents/{agent_id:path}", method="DELETE") async def delete_agent( self, agent_id: str, diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 5ad5bdcdb9..5e2b38697d 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -58,7 +58,7 @@ async def register_dataset( metadata: Optional[Dict[str, Any]] = None, ) -> None: ... - @webmethod(route="/datasets/{dataset_id}", method="GET") + @webmethod(route="/datasets/{dataset_id:path}", method="GET") async def get_dataset( self, dataset_id: str, @@ -67,7 +67,7 @@ async def get_dataset( @webmethod(route="/datasets", method="GET") async def list_datasets(self) -> ListDatasetsResponse: ... 
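
A minimal, self-contained sketch of the `format_map` trick behind `_get_route_parameters` above (simplified: the real `_FormatParameterExtractor.__getitem__` returns `None`, and the names here are illustrative):

```
from typing import List


class FormatParameterExtractor:
    "Records every {name} placeholder that str.format_map() looks up."

    def __init__(self) -> None:
        self.keys: List[str] = []

    def __getitem__(self, key: str) -> str:
        self.keys.append(key)  # only the name matters; the value is discarded
        return ""


def get_route_parameters(route: str) -> List[str]:
    # Strip the Starlette-style converter first: "{agent_id:path}" would
    # otherwise make format_map() fail, since ":path" is not a valid
    # string format specifier.
    route = route.replace(":path", "")
    extractor = FormatParameterExtractor()
    route.format_map(extractor)
    return extractor.keys


assert get_route_parameters(
    "/agents/{agent_id:path}/session/{session_id:path}"
) == ["agent_id", "session_id"]
```
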
- @webmethod(route="/datasets/{dataset_id}", method="DELETE") + @webmethod(route="/datasets/{dataset_id:path}", method="DELETE") async def unregister_dataset( self, dataset_id: str, diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py index 3361c2836e..7e6d9854fa 100644 --- a/llama_stack/apis/models/models.py +++ b/llama_stack/apis/models/models.py @@ -62,7 +62,7 @@ class Models(Protocol): @webmethod(route="/models", method="GET") async def list_models(self) -> ListModelsResponse: ... - @webmethod(route="/models/{model_id}", method="GET") + @webmethod(route="/models/{model_id:path}", method="GET") async def get_model( self, model_id: str, @@ -78,7 +78,7 @@ async def register_model( model_type: Optional[ModelType] = None, ) -> Model: ... - @webmethod(route="/models/{model_id}", method="DELETE") + @webmethod(route="/models/{model_id:path}", method="DELETE") async def unregister_model( self, model_id: str, diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index 3259795832..3fa40ffbfe 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -134,7 +134,7 @@ class ScoringFunctions(Protocol): @webmethod(route="/scoring-functions", method="GET") async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ... - @webmethod(route="/scoring-functions/{scoring_fn_id}", method="GET") + @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET") async def get_scoring_function(self, scoring_fn_id: str, /) -> Optional[ScoringFn]: ... @webmethod(route="/scoring-functions", method="POST") diff --git a/llama_stack/apis/shields/shields.py b/llama_stack/apis/shields/shields.py index 3dd685b145..ae316ee536 100644 --- a/llama_stack/apis/shields/shields.py +++ b/llama_stack/apis/shields/shields.py @@ -48,7 +48,7 @@ class Shields(Protocol): @webmethod(route="/shields", method="GET") async def list_shields(self) -> ListShieldsResponse: ... - @webmethod(route="/shields/{identifier}", method="GET") + @webmethod(route="/shields/{identifier:path}", method="GET") async def get_shield(self, identifier: str) -> Optional[Shield]: ... @webmethod(route="/shields", method="POST") diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index 6272cc40b7..5622aaeac8 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -13,8 +13,8 @@ Literal, Optional, Protocol, - Union, runtime_checkable, + Union, ) from llama_models.llama3.api.datatypes import Primitive @@ -224,13 +224,13 @@ async def query_traces( order_by: Optional[List[str]] = None, ) -> QueryTracesResponse: ... - @webmethod(route="/telemetry/traces/{trace_id}", method="GET") + @webmethod(route="/telemetry/traces/{trace_id:path}", method="GET") async def get_trace(self, trace_id: str) -> Trace: ... - @webmethod(route="/telemetry/traces/{trace_id}/spans/{span_id}", method="GET") + @webmethod(route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET") async def get_span(self, trace_id: str, span_id: str) -> Span: ... 
- @webmethod(route="/telemetry/spans/{span_id}/tree", method="GET") + @webmethod(route="/telemetry/spans/{span_id:path}/tree", method="GET") async def get_span_tree( self, span_id: str, diff --git a/llama_stack/apis/tools/tools.py b/llama_stack/apis/tools/tools.py index d6d806c531..a8e946b082 100644 --- a/llama_stack/apis/tools/tools.py +++ b/llama_stack/apis/tools/tools.py @@ -101,7 +101,7 @@ async def register_tool_group( """Register a tool group""" ... - @webmethod(route="/toolgroups/{toolgroup_id}", method="GET") + @webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET") async def get_tool_group( self, toolgroup_id: str, @@ -117,13 +117,13 @@ async def list_tools(self, toolgroup_id: Optional[str] = None) -> ListToolsRespo """List tools with optional tool group""" ... - @webmethod(route="/tools/{tool_name}", method="GET") + @webmethod(route="/tools/{tool_name:path}", method="GET") async def get_tool( self, tool_name: str, ) -> Tool: ... - @webmethod(route="/toolgroups/{toolgroup_id}", method="DELETE") + @webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE") async def unregister_toolgroup( self, toolgroup_id: str, diff --git a/llama_stack/apis/vector_dbs/vector_dbs.py b/llama_stack/apis/vector_dbs/vector_dbs.py index 4b782e2d5b..1da2c128c2 100644 --- a/llama_stack/apis/vector_dbs/vector_dbs.py +++ b/llama_stack/apis/vector_dbs/vector_dbs.py @@ -46,7 +46,7 @@ class VectorDBs(Protocol): @webmethod(route="/vector-dbs", method="GET") async def list_vector_dbs(self) -> ListVectorDBsResponse: ... - @webmethod(route="/vector-dbs/{vector_db_id}", method="GET") + @webmethod(route="/vector-dbs/{vector_db_id:path}", method="GET") async def get_vector_db( self, vector_db_id: str, @@ -62,5 +62,5 @@ async def register_vector_db( provider_vector_db_id: Optional[str] = None, ) -> VectorDB: ... - @webmethod(route="/vector-dbs/{vector_db_id}", method="DELETE") + @webmethod(route="/vector-dbs/{vector_db_id:path}", method="DELETE") async def unregister_vector_db(self, vector_db_id: str) -> None: ... From 0e426d3cf8c7bc1f14dcfaf98b5212a80fdc0b1c Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Thu, 13 Feb 2025 12:14:57 -0500 Subject: [PATCH 24/31] chore: Link to Groq docs in the warning message for preview model (#1060) This should be `llama-3.2-3b` instead of `llama-3.2-3b-instruct`. --- llama_stack/providers/remote/inference/groq/groq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py index 4e6cc2d6bd..9b3c1abbf2 100644 --- a/llama_stack/providers/remote/inference/groq/groq.py +++ b/llama_stack/providers/remote/inference/groq/groq.py @@ -108,6 +108,7 @@ async def chat_completion( "Groq only contains a preview version for llama-3.2-3b-instruct. " "Preview models aren't recommended for production use. " "They can be discontinued on short notice." 
+ "More details: https://console.groq.com/docs/models" ) request = convert_chat_completion_request( From ceff63130d2c05874b5f8b9f0a5b67cad0cc164a Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 10 Feb 2025 11:21:51 -0800 Subject: [PATCH 25/31] deprecation in OpenAPI spec --- docs/_static/llama-stack-spec.html | 96 +++++++++++++++++++ docs/_static/llama-stack-spec.yaml | 64 +++++++++++++ docs/openapi_generator/pyopenapi/generator.py | 1 + llama_stack/apis/eval_tasks/eval_tasks.py | 71 ++++++++++++++ 4 files changed, 232 insertions(+) create mode 100644 llama_stack/apis/eval_tasks/eval_tasks.py diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index b93f6a380a..2c5827d37f 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,6 +40,7 @@ } ], "paths": { +<<<<<<< HEAD "/v1/eval/tasks/{task_id}/evaluations": { "post": { "responses": { @@ -234,6 +235,8 @@ "deprecated": true } }, +======= +>>>>>>> 974941be (deprecation in OpenAPI spec) "/v1/eval-tasks": { "get": { "responses": { @@ -242,18 +245,27 @@ "content": { "application/json": { "schema": { +<<<<<<< HEAD "$ref": "#/components/schemas/ListBenchmarksResponse" +======= + "$ref": "#/components/schemas/ListEvalTasksResponse" +>>>>>>> 974941be (deprecation in OpenAPI spec) } } } } }, "tags": [ +<<<<<<< HEAD "Benchmarks" +======= + "EvalTasks" +>>>>>>> 974941be (deprecation in OpenAPI spec) ], "description": "", "parameters": [], "deprecated": true +<<<<<<< HEAD }, "post": { "responses": { @@ -318,6 +330,8 @@ "required": true }, "deprecated": true +======= +>>>>>>> 974941be (deprecation in OpenAPI spec) } }, "/v1/datasetio/rows": { @@ -2645,7 +2659,89 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { +<<<<<<< HEAD "AgentCandidate": { +======= + "EvalTask": { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "provider_resource_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "eval_task", + "default": "eval_task" + }, + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "identifier", + "provider_resource_id", + "provider_id", + "type", + "dataset_id", + "scoring_functions", + "metadata" + ] + }, + "ListEvalTasksResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EvalTask" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ] + }, + "AppendRowsRequest": { +>>>>>>> 974941be (deprecation in OpenAPI spec) "type": "object", "properties": { "type": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index b30025020b..c743ce47aa 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,6 +10,7 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: +<<<<<<< HEAD /v1/eval/tasks/{task_id}/evaluations: post: responses: @@ -125,6 +126,8 @@ paths: schema: type: string deprecated: true +======= +>>>>>>> 974941be (deprecation in OpenAPI spec) /v1/eval-tasks: get: responses: @@ -133,6 
+136,7 @@ paths: content: application/json: schema: +<<<<<<< HEAD $ref: '#/components/schemas/ListBenchmarksResponse' tags: - Benchmarks @@ -179,6 +183,14 @@ paths: $ref: '#/components/schemas/DeprecatedRunEvalRequest' required: true deprecated: true +======= + $ref: '#/components/schemas/ListEvalTasksResponse' + tags: + - EvalTasks + description: '' + parameters: [] + deprecated: true +>>>>>>> 974941be (deprecation in OpenAPI spec) /v1/datasetio/rows: get: responses: @@ -1598,7 +1610,59 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: +<<<<<<< HEAD AgentCandidate: +======= + EvalTask: + type: object + properties: + identifier: + type: string + provider_resource_id: + type: string + provider_id: + type: string + type: + type: string + const: eval_task + default: eval_task + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - identifier + - provider_resource_id + - provider_id + - type + - dataset_id + - scoring_functions + - metadata + ListEvalTasksResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/EvalTask' + additionalProperties: false + required: + - data + AppendRowsRequest: +>>>>>>> 974941be (deprecation in OpenAPI spec) type: object properties: type: diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index e37c45690c..0f3b997848 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -647,6 +647,7 @@ def _build_operation(self, op: EndpointOperation) -> Operation: description = "\n".join( filter(None, [doc_string.short_description, doc_string.long_description]) ) + return Operation( tags=[op.defining_class.__name__], summary=None, diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py new file mode 100644 index 0000000000..9a26fd0c0d --- /dev/null +++ b/llama_stack/apis/eval_tasks/eval_tasks.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable + +from llama_models.schema_utils import json_schema_type, webmethod +from pydantic import BaseModel, Field + +from llama_stack.apis.resource import Resource, ResourceType + + +class CommonEvalTaskFields(BaseModel): + dataset_id: str + scoring_functions: List[str] + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="Metadata for this evaluation task", + ) + + +@json_schema_type +class EvalTask(CommonEvalTaskFields, Resource): + type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value + + @property + def task_id(self) -> str: + return self.identifier + + @property + def provider_eval_task_id(self) -> str: + return self.provider_resource_id + + +class EvalTaskInput(CommonEvalTaskFields, BaseModel): + task_id: str + provider_id: Optional[str] = None + provider_eval_task_id: Optional[str] = None + + +class ListEvalTasksResponse(BaseModel): + data: List[EvalTask] + + +@runtime_checkable +class EvalTasks(Protocol): + @webmethod(route="/eval-tasks", method="GET") + async def DEPRECATED_list_eval_tasks_deprecated( + self, + ) -> ListEvalTasksResponse: ... + + @webmethod(route="/eval/tasks", method="GET") + async def list_eval_tasks(self) -> ListEvalTasksResponse: ... + + @webmethod(route="/eval/tasks/{task_id}", method="GET") + async def get_eval_task( + self, + task_id: str, + ) -> Optional[EvalTask]: ... + + @webmethod(route="/eval/tasks", method="POST") + async def register_eval_task( + self, + task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_eval_task_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... From 9ce00ede9bb2f1db7a91d6d4379886294cad287a Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Feb 2025 09:48:52 -0800 Subject: [PATCH 26/31] update --- docs/_static/llama-stack-spec.html | 55 +++---------------- docs/_static/llama-stack-spec.yaml | 13 +---- .../Llama_Stack_Benchmark_Evals.ipynb | 2 +- .../distribution/routers/routing_tables.py | 6 ++ 4 files changed, 18 insertions(+), 58 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 2c5827d37f..6cd8b47581 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2665,15 +2665,6 @@ "EvalTask": { "type": "object", "properties": { - "identifier": { - "type": "string" - }, - "provider_resource_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, "type": { "type": "string", "const": "eval_task", @@ -2682,53 +2673,23 @@ "dataset_id": { "type": "string" }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "config": { + "$ref": "#/components/schemas/AgentConfig" } }, "additionalProperties": false, "required": [ - "identifier", - "provider_resource_id", - "provider_id", "type", - "dataset_id", - "scoring_functions", - "metadata" + "config" ] }, "ListEvalTasksResponse": { "type": "object", "properties": { - "data": { + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "input_shields": { "type": "array", "items": { "$ref": "#/components/schemas/EvalTask" @@ -2768,7 +2729,7 @@ "input_shields": { "type": "array", 
"items": { - "type": "string" + "$ref": "#/components/schemas/ToolDef" } }, "output_shields": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index c743ce47aa..19980de99a 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1616,12 +1616,6 @@ components: EvalTask: type: object properties: - identifier: - type: string - provider_resource_id: - type: string - provider_id: - type: string type: type: string const: eval_task @@ -1644,9 +1638,6 @@ components: - type: object additionalProperties: false required: - - identifier - - provider_resource_id - - provider_id - type - dataset_id - scoring_functions @@ -1654,7 +1645,9 @@ components: ListEvalTasksResponse: type: object properties: - data: + sampling_params: + $ref: '#/components/schemas/SamplingParams' + input_shields: type: array items: $ref: '#/components/schemas/EvalTask' diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 8eecf84abb..2861c8499d 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1214,7 +1214,7 @@ " \"sampling_params\": {\n", " \"strategy\": {\n", " \"type\": \"greedy\",\n", - " },\n", + " },b\n", " \"max_tokens\": 4096,\n", " \"repeat_penalty\": 1.0,\n", " },\n", diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index ec258af491..563c5c5ab1 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -472,16 +472,20 @@ async def register_benchmark( async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: logger.warning("DEPRECATED: Use /eval/benchmarks instead") return await self.list_benchmarks() + return await self.list_benchmarks() async def DEPRECATED_get_eval_task( self, + task_id: str, eval_task_id: str, ) -> Optional[Benchmark]: logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.get_benchmark(task_id) return await self.get_benchmark(eval_task_id) async def DEPRECATED_register_eval_task( self, + task_id: str, eval_task_id: str, dataset_id: str, scoring_functions: List[str], @@ -490,6 +494,8 @@ async def DEPRECATED_register_eval_task( metadata: Optional[Dict[str, Any]] = None, ) -> None: logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.register_benchmark( + benchmark_id=task_id, return await self.register_benchmark( benchmark_id=eval_task_id, dataset_id=dataset_id, From 39980dc83f9c43e349156ad33acffc758f51db4d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 21:39:52 -0800 Subject: [PATCH 27/31] openapi --- docs/_static/llama-stack-spec.yaml | 51 +++++++++++++++++++----------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 19980de99a..c36c6e2571 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -2922,34 +2922,47 @@ components: TextDelta: type: object properties: - type: + name: type: string - const: text - default: text - text: + description: type: string + parameters: + type: array + items: + $ref: '#/components/schemas/ToolParameter' + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object additionalProperties: false required: - - type - - text - 
ToolCallDelta: + - name + ToolParameter: type: object properties: - type: + name: type: string - const: tool_call - default: tool_call - tool_call: + parameter_type: + type: string + description: + type: string + required: + type: boolean + default: true + default: oneOf: + - type: 'null' + - type: boolean + - type: number - type: string - - $ref: '#/components/schemas/ToolCall' - parse_status: - type: string - enum: - - started - - in_progress - - failed - - succeeded + - type: array + - type: object additionalProperties: false required: - type From 139d5bded62de6c6df1cbc796ba22bb156d050c4 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Feb 2025 09:53:03 -0800 Subject: [PATCH 28/31] update --- docs/_static/llama-stack-spec.html | 59 +--------------- docs/_static/llama-stack-spec.yaml | 108 +++++------------------------ 2 files changed, 20 insertions(+), 147 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 6cd8b47581..b93f6a380a 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,7 +40,6 @@ } ], "paths": { -<<<<<<< HEAD "/v1/eval/tasks/{task_id}/evaluations": { "post": { "responses": { @@ -235,8 +234,6 @@ "deprecated": true } }, -======= ->>>>>>> 974941be (deprecation in OpenAPI spec) "/v1/eval-tasks": { "get": { "responses": { @@ -245,27 +242,18 @@ "content": { "application/json": { "schema": { -<<<<<<< HEAD "$ref": "#/components/schemas/ListBenchmarksResponse" -======= - "$ref": "#/components/schemas/ListEvalTasksResponse" ->>>>>>> 974941be (deprecation in OpenAPI spec) } } } } }, "tags": [ -<<<<<<< HEAD "Benchmarks" -======= - "EvalTasks" ->>>>>>> 974941be (deprecation in OpenAPI spec) ], "description": "", "parameters": [], "deprecated": true -<<<<<<< HEAD }, "post": { "responses": { @@ -330,8 +318,6 @@ "required": true }, "deprecated": true -======= ->>>>>>> 974941be (deprecation in OpenAPI spec) } }, "/v1/datasetio/rows": { @@ -2659,50 +2645,7 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { -<<<<<<< HEAD "AgentCandidate": { -======= - "EvalTask": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "eval_task", - "default": "eval_task" - }, - "dataset_id": { - "type": "string" - }, - "config": { - "$ref": "#/components/schemas/AgentConfig" - } - }, - "additionalProperties": false, - "required": [ - "type", - "config" - ] - }, - "ListEvalTasksResponse": { - "type": "object", - "properties": { - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "input_shields": { - "type": "array", - "items": { - "$ref": "#/components/schemas/EvalTask" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ] - }, - "AppendRowsRequest": { ->>>>>>> 974941be (deprecation in OpenAPI spec) "type": "object", "properties": { "type": { @@ -2729,7 +2672,7 @@ "input_shields": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolDef" + "type": "string" } }, "output_shields": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index c36c6e2571..b30025020b 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,7 +10,6 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: -<<<<<<< HEAD /v1/eval/tasks/{task_id}/evaluations: post: responses: @@ -126,8 +125,6 @@ paths: schema: type: string deprecated: true -======= ->>>>>>> 974941be (deprecation in OpenAPI spec) /v1/eval-tasks: get: responses: 
@@ -136,7 +133,6 @@ paths: content: application/json: schema: -<<<<<<< HEAD $ref: '#/components/schemas/ListBenchmarksResponse' tags: - Benchmarks @@ -183,14 +179,6 @@ paths: $ref: '#/components/schemas/DeprecatedRunEvalRequest' required: true deprecated: true -======= - $ref: '#/components/schemas/ListEvalTasksResponse' - tags: - - EvalTasks - description: '' - parameters: [] - deprecated: true ->>>>>>> 974941be (deprecation in OpenAPI spec) /v1/datasetio/rows: get: responses: @@ -1610,52 +1598,7 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: -<<<<<<< HEAD AgentCandidate: -======= - EvalTask: - type: object - properties: - type: - type: string - const: eval_task - default: eval_task - dataset_id: - type: string - scoring_functions: - type: array - items: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - type - - dataset_id - - scoring_functions - - metadata - ListEvalTasksResponse: - type: object - properties: - sampling_params: - $ref: '#/components/schemas/SamplingParams' - input_shields: - type: array - items: - $ref: '#/components/schemas/EvalTask' - additionalProperties: false - required: - - data - AppendRowsRequest: ->>>>>>> 974941be (deprecation in OpenAPI spec) type: object properties: type: @@ -2922,47 +2865,34 @@ components: TextDelta: type: object properties: - name: + type: type: string - description: + const: text + default: text + text: type: string - parameters: - type: array - items: - $ref: '#/components/schemas/ToolParameter' - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object additionalProperties: false required: - - name - ToolParameter: + - type + - text + ToolCallDelta: type: object properties: - name: - type: string - parameter_type: - type: string - description: + type: type: string - required: - type: boolean - default: true - default: + const: tool_call + default: tool_call + tool_call: oneOf: - - type: 'null' - - type: boolean - - type: number - type: string - - type: array - - type: object + - $ref: '#/components/schemas/ToolCall' + parse_status: + type: string + enum: + - started + - in_progress + - failed + - succeeded additionalProperties: false required: - type From e183ec988f93bb2d89dc5526ea7937129ddd9f17 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Feb 2025 09:58:10 -0800 Subject: [PATCH 29/31] update --- docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb | 2 +- llama_stack/distribution/routers/routing_tables.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 2861c8499d..8eecf84abb 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1214,7 +1214,7 @@ " \"sampling_params\": {\n", " \"strategy\": {\n", " \"type\": \"greedy\",\n", - " },b\n", + " },\n", " \"max_tokens\": 4096,\n", " \"repeat_penalty\": 1.0,\n", " },\n", diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 563c5c5ab1..ec258af491 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -472,20 +472,16 @@ async def register_benchmark( 
     async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
         logger.warning("DEPRECATED: Use /eval/benchmarks instead")
         return await self.list_benchmarks()
-        return await self.list_benchmarks()
 
     async def DEPRECATED_get_eval_task(
         self,
-        task_id: str,
         eval_task_id: str,
     ) -> Optional[Benchmark]:
         logger.warning("DEPRECATED: Use /eval/benchmarks instead")
-        return await self.get_benchmark(task_id)
         return await self.get_benchmark(eval_task_id)
 
     async def DEPRECATED_register_eval_task(
         self,
-        task_id: str,
         eval_task_id: str,
         dataset_id: str,
         scoring_functions: List[str],
@@ -494,8 +490,6 @@ async def DEPRECATED_register_eval_task(
         metadata: Optional[Dict[str, Any]] = None,
     ) -> None:
         logger.warning("DEPRECATED: Use /eval/benchmarks instead")
-        return await self.register_benchmark(
-            benchmark_id=task_id,
         return await self.register_benchmark(
             benchmark_id=eval_task_id,
             dataset_id=dataset_id,

From c56db9e7b258f3cfd21fa6156ea8c8ef3f68dfb5 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Thu, 13 Feb 2025 10:11:18 -0800
Subject: [PATCH 30/31] completely remove eval_task

---
 llama_stack/apis/eval_tasks/eval_tasks.py | 71 -----------------------
 1 file changed, 71 deletions(-)
 delete mode 100644 llama_stack/apis/eval_tasks/eval_tasks.py

diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py
deleted file mode 100644
index 9a26fd0c0d..0000000000
--- a/llama_stack/apis/eval_tasks/eval_tasks.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
-
-from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-
-
-class CommonEvalTaskFields(BaseModel):
-    dataset_id: str
-    scoring_functions: List[str]
-    metadata: Dict[str, Any] = Field(
-        default_factory=dict,
-        description="Metadata for this evaluation task",
-    )
-
-
-@json_schema_type
-class EvalTask(CommonEvalTaskFields, Resource):
-    type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value
-
-    @property
-    def task_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_eval_task_id(self) -> str:
-        return self.provider_resource_id
-
-
-class EvalTaskInput(CommonEvalTaskFields, BaseModel):
-    task_id: str
-    provider_id: Optional[str] = None
-    provider_eval_task_id: Optional[str] = None
-
-
-class ListEvalTasksResponse(BaseModel):
-    data: List[EvalTask]
-
-
-@runtime_checkable
-class EvalTasks(Protocol):
-    @webmethod(route="/eval-tasks", method="GET")
-    async def DEPRECATED_list_eval_tasks_deprecated(
-        self,
-    ) -> ListEvalTasksResponse: ...
-
-    @webmethod(route="/eval/tasks", method="GET")
-    async def list_eval_tasks(self) -> ListEvalTasksResponse: ...
-
-    @webmethod(route="/eval/tasks/{task_id}", method="GET")
-    async def get_eval_task(
-        self,
-        task_id: str,
-    ) -> Optional[EvalTask]: ...
-
-    @webmethod(route="/eval/tasks", method="POST")
-    async def register_eval_task(
-        self,
-        task_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_eval_task_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
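
The shim pattern used by the `DEPRECATED_*` methods above is small enough to show in isolation; this is a sketch, with the class pared down to the delegation logic (the real routing table resolves objects from a registry):

```
import asyncio
import logging
from typing import Optional

logger = logging.getLogger(__name__)


class BenchmarksRoutingTable:
    async def get_benchmark(self, benchmark_id: str) -> Optional[dict]:
        # Stand-in for the real registry lookup.
        return {"identifier": benchmark_id, "type": "benchmark"}

    async def DEPRECATED_get_eval_task(self, eval_task_id: str) -> Optional[dict]:
        # Old route name kept for backwards compatibility: warn, then delegate.
        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
        return await self.get_benchmark(eval_task_id)


print(asyncio.run(BenchmarksRoutingTable().DEPRECATED_get_eval_task("mmlu")))
```
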
From b0ad0c109014cf86be258651fd8ecc8ebc210b90 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Feb 2025 10:13:10 -0800 Subject: [PATCH 31/31] precommit --- llama_stack/apis/telemetry/telemetry.py | 2 +- llama_stack/distribution/datatypes.py | 1 - llama_stack/distribution/routers/routing_tables.py | 1 - llama_stack/providers/datatypes.py | 1 - llama_stack/providers/inline/eval/meta_reference/eval.py | 1 - llama_stack/providers/tests/resolver.py | 1 - 6 files changed, 1 insertion(+), 6 deletions(-) diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index 5622aaeac8..63ae1dc738 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -13,8 +13,8 @@ Literal, Optional, Protocol, - runtime_checkable, Union, + runtime_checkable, ) from llama_models.llama3.api.datatypes import Primitive diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 75ab73b9ba..f62996081b 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -9,7 +9,6 @@ from pydantic import BaseModel, Field from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput - from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset, DatasetInput from llama_stack.apis.eval import Eval diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index ec258af491..2cddc3970d 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -10,7 +10,6 @@ from pydantic import TypeAdapter from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse - from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ParamType from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index 5d56505af1..b92f9dc0a0 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -11,7 +11,6 @@ from pydantic import BaseModel, Field from llama_stack.apis.benchmarks import Benchmark - from llama_stack.apis.datasets import Dataset from llama_stack.apis.datatypes import Api from llama_stack.apis.models import Model diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 9827ff2081..cd99c9ad89 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -15,7 +15,6 @@ from llama_stack.apis.scoring import Scoring from llama_stack.distribution.datatypes import Api from llama_stack.providers.datatypes import BenchmarksProtocolPrivate - from llama_stack.providers.inline.agents.meta_reference.agent_instance import ( MEMORY_QUERY_TOOL, ) diff --git a/llama_stack/providers/tests/resolver.py b/llama_stack/providers/tests/resolver.py index 092514079a..76343b7f48 100644 --- a/llama_stack/providers/tests/resolver.py +++ b/llama_stack/providers/tests/resolver.py @@ -11,7 +11,6 @@ from pydantic import BaseModel from llama_stack.apis.benchmarks import BenchmarkInput - from llama_stack.apis.datasets import DatasetInput from llama_stack.apis.models import ModelInput from llama_stack.apis.scoring_functions import ScoringFnInput
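
A footnote on the import reshuffle in `telemetry.py` above: the sort applied by the pre-commit formatter is case-sensitive, and uppercase letters precede lowercase letters in ASCII, which is why `Union` now lands ahead of `runtime_checkable`. A quick check of that ordering (assuming plain `sorted()` semantics, which case-sensitive import sorters follow):

```
names = ["runtime_checkable", "Union", "Protocol", "Optional", "Literal"]
assert sorted(names) == ["Literal", "Optional", "Protocol", "Union", "runtime_checkable"]
assert ord("U") < ord("r")  # 85 < 114, so uppercase sorts first
```
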