From 8bdafa997a30877e8e6119804ad069c7cd6afb44 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 20 Feb 2025 14:30:05 -0800 Subject: [PATCH] Sync updates from stainless branch: main --- src/llama_stack_client/_client.py | 9 -- src/llama_stack_client/resources/__init__.py | 14 --- src/llama_stack_client/resources/eval/eval.py | 32 +++---- src/llama_stack_client/types/__init__.py | 2 - .../types/benchmark_config_param.py | 4 +- tests/api_resources/test_eval.py | 88 +++++-------------- 6 files changed, 41 insertions(+), 108 deletions(-) diff --git a/src/llama_stack_client/_client.py b/src/llama_stack_client/_client.py index 82353ebd..bb5bb755 100644 --- a/src/llama_stack_client/_client.py +++ b/src/llama_stack_client/_client.py @@ -40,7 +40,6 @@ telemetry, vector_io, benchmarks, - eval_tasks, toolgroups, vector_dbs, batch_inference, @@ -94,7 +93,6 @@ class LlamaStackClient(SyncAPIClient): datasetio: datasetio.DatasetioResource scoring: scoring.ScoringResource scoring_functions: scoring_functions.ScoringFunctionsResource - eval_tasks: eval_tasks.EvalTasksResource benchmarks: benchmarks.BenchmarksResource with_raw_response: LlamaStackClientWithRawResponse with_streaming_response: LlamaStackClientWithStreamedResponse @@ -177,7 +175,6 @@ def __init__( self.datasetio = datasetio.DatasetioResource(self) self.scoring = scoring.ScoringResource(self) self.scoring_functions = scoring_functions.ScoringFunctionsResource(self) - self.eval_tasks = eval_tasks.EvalTasksResource(self) self.benchmarks = benchmarks.BenchmarksResource(self) self.with_raw_response = LlamaStackClientWithRawResponse(self) self.with_streaming_response = LlamaStackClientWithStreamedResponse(self) @@ -312,7 +309,6 @@ class AsyncLlamaStackClient(AsyncAPIClient): datasetio: datasetio.AsyncDatasetioResource scoring: scoring.AsyncScoringResource scoring_functions: scoring_functions.AsyncScoringFunctionsResource - eval_tasks: eval_tasks.AsyncEvalTasksResource benchmarks: benchmarks.AsyncBenchmarksResource with_raw_response: AsyncLlamaStackClientWithRawResponse with_streaming_response: AsyncLlamaStackClientWithStreamedResponse @@ -395,7 +391,6 @@ def __init__( self.datasetio = datasetio.AsyncDatasetioResource(self) self.scoring = scoring.AsyncScoringResource(self) self.scoring_functions = scoring_functions.AsyncScoringFunctionsResource(self) - self.eval_tasks = eval_tasks.AsyncEvalTasksResource(self) self.benchmarks = benchmarks.AsyncBenchmarksResource(self) self.with_raw_response = AsyncLlamaStackClientWithRawResponse(self) self.with_streaming_response = AsyncLlamaStackClientWithStreamedResponse(self) @@ -533,7 +528,6 @@ def __init__(self, client: LlamaStackClient) -> None: self.datasetio = datasetio.DatasetioResourceWithRawResponse(client.datasetio) self.scoring = scoring.ScoringResourceWithRawResponse(client.scoring) self.scoring_functions = scoring_functions.ScoringFunctionsResourceWithRawResponse(client.scoring_functions) - self.eval_tasks = eval_tasks.EvalTasksResourceWithRawResponse(client.eval_tasks) self.benchmarks = benchmarks.BenchmarksResourceWithRawResponse(client.benchmarks) @@ -565,7 +559,6 @@ def __init__(self, client: AsyncLlamaStackClient) -> None: self.scoring_functions = scoring_functions.AsyncScoringFunctionsResourceWithRawResponse( client.scoring_functions ) - self.eval_tasks = eval_tasks.AsyncEvalTasksResourceWithRawResponse(client.eval_tasks) self.benchmarks = benchmarks.AsyncBenchmarksResourceWithRawResponse(client.benchmarks) @@ -597,7 +590,6 @@ def __init__(self, client: LlamaStackClient) -> None: 
self.scoring_functions = scoring_functions.ScoringFunctionsResourceWithStreamingResponse( client.scoring_functions ) - self.eval_tasks = eval_tasks.EvalTasksResourceWithStreamingResponse(client.eval_tasks) self.benchmarks = benchmarks.BenchmarksResourceWithStreamingResponse(client.benchmarks) @@ -631,7 +623,6 @@ def __init__(self, client: AsyncLlamaStackClient) -> None: self.scoring_functions = scoring_functions.AsyncScoringFunctionsResourceWithStreamingResponse( client.scoring_functions ) - self.eval_tasks = eval_tasks.AsyncEvalTasksResourceWithStreamingResponse(client.eval_tasks) self.benchmarks = benchmarks.AsyncBenchmarksResourceWithStreamingResponse(client.benchmarks) diff --git a/src/llama_stack_client/resources/__init__.py b/src/llama_stack_client/resources/__init__.py index b5e449c9..449fb4a1 100644 --- a/src/llama_stack_client/resources/__init__.py +++ b/src/llama_stack_client/resources/__init__.py @@ -128,14 +128,6 @@ BenchmarksResourceWithStreamingResponse, AsyncBenchmarksResourceWithStreamingResponse, ) -from .eval_tasks import ( - EvalTasksResource, - AsyncEvalTasksResource, - EvalTasksResourceWithRawResponse, - AsyncEvalTasksResourceWithRawResponse, - EvalTasksResourceWithStreamingResponse, - AsyncEvalTasksResourceWithStreamingResponse, -) from .toolgroups import ( ToolgroupsResource, AsyncToolgroupsResource, @@ -326,12 +318,6 @@ "AsyncScoringFunctionsResourceWithRawResponse", "ScoringFunctionsResourceWithStreamingResponse", "AsyncScoringFunctionsResourceWithStreamingResponse", - "EvalTasksResource", - "AsyncEvalTasksResource", - "EvalTasksResourceWithRawResponse", - "AsyncEvalTasksResourceWithRawResponse", - "EvalTasksResourceWithStreamingResponse", - "AsyncEvalTasksResourceWithStreamingResponse", "BenchmarksResource", "AsyncBenchmarksResource", "BenchmarksResourceWithRawResponse", diff --git a/src/llama_stack_client/resources/eval/eval.py b/src/llama_stack_client/resources/eval/eval.py index 053d2398..6ea1669c 100644 --- a/src/llama_stack_client/resources/eval/eval.py +++ b/src/llama_stack_client/resources/eval/eval.py @@ -67,7 +67,7 @@ def with_streaming_response(self) -> EvalResourceWithStreamingResponse: def evaluate_rows( self, - task_id: str, + benchmark_id: str, *, input_rows: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]], scoring_functions: List[str], @@ -89,10 +89,10 @@ def evaluate_rows( timeout: Override the client-level default timeout for this request, in seconds """ - if not task_id: - raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}") + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") return self._post( - f"/v1/eval/tasks/{task_id}/evaluations", + f"/v1/eval/benchmarks/{benchmark_id}/evaluations", body=maybe_transform( { "input_rows": input_rows, @@ -151,7 +151,7 @@ def evaluate_rows_alpha( def run_eval( self, - task_id: str, + benchmark_id: str, *, task_config: BenchmarkConfigParam, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. 
@@ -171,10 +171,10 @@ def run_eval( timeout: Override the client-level default timeout for this request, in seconds """ - if not task_id: - raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}") + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") return self._post( - f"/v1/eval/tasks/{task_id}/jobs", + f"/v1/eval/benchmarks/{benchmark_id}/jobs", body=maybe_transform({"task_config": task_config}, eval_run_eval_params.EvalRunEvalParams), options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout @@ -242,7 +242,7 @@ def with_streaming_response(self) -> AsyncEvalResourceWithStreamingResponse: async def evaluate_rows( self, - task_id: str, + benchmark_id: str, *, input_rows: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]], scoring_functions: List[str], @@ -264,10 +264,10 @@ async def evaluate_rows( timeout: Override the client-level default timeout for this request, in seconds """ - if not task_id: - raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}") + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") return await self._post( - f"/v1/eval/tasks/{task_id}/evaluations", + f"/v1/eval/benchmarks/{benchmark_id}/evaluations", body=await async_maybe_transform( { "input_rows": input_rows, @@ -326,7 +326,7 @@ async def evaluate_rows_alpha( async def run_eval( self, - task_id: str, + benchmark_id: str, *, task_config: BenchmarkConfigParam, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. @@ -346,10 +346,10 @@ async def run_eval( timeout: Override the client-level default timeout for this request, in seconds """ - if not task_id: - raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}") + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") return await self._post( - f"/v1/eval/tasks/{task_id}/jobs", + f"/v1/eval/benchmarks/{benchmark_id}/jobs", body=await async_maybe_transform({"task_config": task_config}, eval_run_eval_params.EvalRunEvalParams), options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py index bc94eb13..ed400c28 100644 --- a/src/llama_stack_client/types/__init__.py +++ b/src/llama_stack_client/types/__init__.py @@ -86,7 +86,6 @@ from .vector_io_query_params import VectorIoQueryParams as VectorIoQueryParams from .benchmark_list_response import BenchmarkListResponse as BenchmarkListResponse from .dataset_register_params import DatasetRegisterParams as DatasetRegisterParams -from .eval_task_list_response import EvalTaskListResponse as EvalTaskListResponse from .list_providers_response import ListProvidersResponse as ListProvidersResponse from .scoring_fn_params_param import ScoringFnParamsParam as ScoringFnParamsParam from .toolgroup_list_response import ToolgroupListResponse as ToolgroupListResponse @@ -98,7 +97,6 @@ from .benchmark_register_params import BenchmarkRegisterParams as BenchmarkRegisterParams from .dataset_retrieve_response import DatasetRetrieveResponse as DatasetRetrieveResponse from .eval_evaluate_rows_params import EvalEvaluateRowsParams as 
EvalEvaluateRowsParams -from .eval_task_register_params import EvalTaskRegisterParams as EvalTaskRegisterParams from .list_tool_groups_response import ListToolGroupsResponse as ListToolGroupsResponse from .toolgroup_register_params import ToolgroupRegisterParams as ToolgroupRegisterParams from .vector_db_register_params import VectorDBRegisterParams as VectorDBRegisterParams diff --git a/src/llama_stack_client/types/benchmark_config_param.py b/src/llama_stack_client/types/benchmark_config_param.py index 48090c5f..f958833a 100644 --- a/src/llama_stack_client/types/benchmark_config_param.py +++ b/src/llama_stack_client/types/benchmark_config_param.py @@ -3,7 +3,7 @@ from __future__ import annotations from typing import Dict -from typing_extensions import Literal, Required, TypedDict +from typing_extensions import Required, TypedDict from .eval_candidate_param import EvalCandidateParam from .scoring_fn_params_param import ScoringFnParamsParam @@ -16,6 +16,4 @@ class BenchmarkConfigParam(TypedDict, total=False): scoring_params: Required[Dict[str, ScoringFnParamsParam]] - type: Required[Literal["benchmark"]] - num_examples: int diff --git a/tests/api_resources/test_eval.py b/tests/api_resources/test_eval.py index de5d0cac..cf120885 100644 --- a/tests/api_resources/test_eval.py +++ b/tests/api_resources/test_eval.py @@ -23,7 +23,7 @@ class TestEval: @parametrize def test_method_evaluate_rows(self, client: LlamaStackClient) -> None: eval = client.eval.evaluate_rows( - task_id="task_id", + benchmark_id="benchmark_id", input_rows=[{"foo": True}], scoring_functions=["string"], task_config={ @@ -38,7 +38,6 @@ def test_method_evaluate_rows(self, client: LlamaStackClient) -> None: "type": "llm_as_judge", } }, - "type": "benchmark", }, ) assert_matches_type(EvaluateResponse, eval, path=["response"]) @@ -46,7 +45,7 @@ def test_method_evaluate_rows(self, client: LlamaStackClient) -> None: @parametrize def test_method_evaluate_rows_with_all_params(self, client: LlamaStackClient) -> None: eval = client.eval.evaluate_rows( - task_id="task_id", + benchmark_id="benchmark_id", input_rows=[{"foo": True}], scoring_functions=["string"], task_config={ @@ -72,7 +71,6 @@ def test_method_evaluate_rows_with_all_params(self, client: LlamaStackClient) -> "prompt_template": "prompt_template", } }, - "type": "benchmark", "num_examples": 0, }, ) @@ -81,7 +79,7 @@ def test_method_evaluate_rows_with_all_params(self, client: LlamaStackClient) -> @parametrize def test_raw_response_evaluate_rows(self, client: LlamaStackClient) -> None: response = client.eval.with_raw_response.evaluate_rows( - task_id="task_id", + benchmark_id="benchmark_id", input_rows=[{"foo": True}], scoring_functions=["string"], task_config={ @@ -96,7 +94,6 @@ def test_raw_response_evaluate_rows(self, client: LlamaStackClient) -> None: "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -108,7 +105,7 @@ def test_raw_response_evaluate_rows(self, client: LlamaStackClient) -> None: @parametrize def test_streaming_response_evaluate_rows(self, client: LlamaStackClient) -> None: with client.eval.with_streaming_response.evaluate_rows( - task_id="task_id", + benchmark_id="benchmark_id", input_rows=[{"foo": True}], scoring_functions=["string"], task_config={ @@ -123,7 +120,6 @@ def test_streaming_response_evaluate_rows(self, client: LlamaStackClient) -> Non "type": "llm_as_judge", } }, - "type": "benchmark", }, ) as response: assert not response.is_closed @@ -136,9 +132,9 @@ def test_streaming_response_evaluate_rows(self, client: LlamaStackClient) -> Non 
@parametrize def test_path_params_evaluate_rows(self, client: LlamaStackClient) -> None: - with pytest.raises(ValueError, match=r"Expected a non-empty value for `task_id` but received ''"): + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): client.eval.with_raw_response.evaluate_rows( - task_id="", + benchmark_id="", input_rows=[{"foo": True}], scoring_functions=["string"], task_config={ @@ -153,7 +149,6 @@ def test_path_params_evaluate_rows(self, client: LlamaStackClient) -> None: "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -175,7 +170,6 @@ def test_method_evaluate_rows_alpha(self, client: LlamaStackClient) -> None: "type": "llm_as_judge", } }, - "type": "benchmark", }, ) assert_matches_type(EvaluateResponse, eval, path=["response"]) @@ -209,7 +203,6 @@ def test_method_evaluate_rows_alpha_with_all_params(self, client: LlamaStackClie "prompt_template": "prompt_template", } }, - "type": "benchmark", "num_examples": 0, }, ) @@ -233,7 +226,6 @@ def test_raw_response_evaluate_rows_alpha(self, client: LlamaStackClient) -> Non "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -260,7 +252,6 @@ def test_streaming_response_evaluate_rows_alpha(self, client: LlamaStackClient) "type": "llm_as_judge", } }, - "type": "benchmark", }, ) as response: assert not response.is_closed @@ -290,14 +281,13 @@ def test_path_params_evaluate_rows_alpha(self, client: LlamaStackClient) -> None "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @parametrize def test_method_run_eval(self, client: LlamaStackClient) -> None: eval = client.eval.run_eval( - task_id="task_id", + benchmark_id="benchmark_id", task_config={ "eval_candidate": { "model": "model", @@ -310,7 +300,6 @@ def test_method_run_eval(self, client: LlamaStackClient) -> None: "type": "llm_as_judge", } }, - "type": "benchmark", }, ) assert_matches_type(Job, eval, path=["response"]) @@ -318,7 +307,7 @@ def test_method_run_eval(self, client: LlamaStackClient) -> None: @parametrize def test_method_run_eval_with_all_params(self, client: LlamaStackClient) -> None: eval = client.eval.run_eval( - task_id="task_id", + benchmark_id="benchmark_id", task_config={ "eval_candidate": { "model": "model", @@ -342,7 +331,6 @@ def test_method_run_eval_with_all_params(self, client: LlamaStackClient) -> None "prompt_template": "prompt_template", } }, - "type": "benchmark", "num_examples": 0, }, ) @@ -351,7 +339,7 @@ def test_method_run_eval_with_all_params(self, client: LlamaStackClient) -> None @parametrize def test_raw_response_run_eval(self, client: LlamaStackClient) -> None: response = client.eval.with_raw_response.run_eval( - task_id="task_id", + benchmark_id="benchmark_id", task_config={ "eval_candidate": { "model": "model", @@ -364,7 +352,6 @@ def test_raw_response_run_eval(self, client: LlamaStackClient) -> None: "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -376,7 +363,7 @@ def test_raw_response_run_eval(self, client: LlamaStackClient) -> None: @parametrize def test_streaming_response_run_eval(self, client: LlamaStackClient) -> None: with client.eval.with_streaming_response.run_eval( - task_id="task_id", + benchmark_id="benchmark_id", task_config={ "eval_candidate": { "model": "model", @@ -389,7 +376,6 @@ def test_streaming_response_run_eval(self, client: LlamaStackClient) -> None: "type": "llm_as_judge", } }, - "type": "benchmark", }, ) as response: assert not response.is_closed @@ -402,9 +388,9 @@ def test_streaming_response_run_eval(self, client: LlamaStackClient) -> 
None: @parametrize def test_path_params_run_eval(self, client: LlamaStackClient) -> None: - with pytest.raises(ValueError, match=r"Expected a non-empty value for `task_id` but received ''"): + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): client.eval.with_raw_response.run_eval( - task_id="", + benchmark_id="", task_config={ "eval_candidate": { "model": "model", @@ -417,7 +403,6 @@ def test_path_params_run_eval(self, client: LlamaStackClient) -> None: "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -437,7 +422,6 @@ def test_method_run_eval_alpha(self, client: LlamaStackClient) -> None: "type": "llm_as_judge", } }, - "type": "benchmark", }, ) assert_matches_type(Job, eval, path=["response"]) @@ -469,7 +453,6 @@ def test_method_run_eval_alpha_with_all_params(self, client: LlamaStackClient) - "prompt_template": "prompt_template", } }, - "type": "benchmark", "num_examples": 0, }, ) @@ -491,7 +474,6 @@ def test_raw_response_run_eval_alpha(self, client: LlamaStackClient) -> None: "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -516,7 +498,6 @@ def test_streaming_response_run_eval_alpha(self, client: LlamaStackClient) -> No "type": "llm_as_judge", } }, - "type": "benchmark", }, ) as response: assert not response.is_closed @@ -544,7 +525,6 @@ def test_path_params_run_eval_alpha(self, client: LlamaStackClient) -> None: "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -555,7 +535,7 @@ class TestAsyncEval: @parametrize async def test_method_evaluate_rows(self, async_client: AsyncLlamaStackClient) -> None: eval = await async_client.eval.evaluate_rows( - task_id="task_id", + benchmark_id="benchmark_id", input_rows=[{"foo": True}], scoring_functions=["string"], task_config={ @@ -570,7 +550,6 @@ async def test_method_evaluate_rows(self, async_client: AsyncLlamaStackClient) - "type": "llm_as_judge", } }, - "type": "benchmark", }, ) assert_matches_type(EvaluateResponse, eval, path=["response"]) @@ -578,7 +557,7 @@ async def test_method_evaluate_rows(self, async_client: AsyncLlamaStackClient) - @parametrize async def test_method_evaluate_rows_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: eval = await async_client.eval.evaluate_rows( - task_id="task_id", + benchmark_id="benchmark_id", input_rows=[{"foo": True}], scoring_functions=["string"], task_config={ @@ -604,7 +583,6 @@ async def test_method_evaluate_rows_with_all_params(self, async_client: AsyncLla "prompt_template": "prompt_template", } }, - "type": "benchmark", "num_examples": 0, }, ) @@ -613,7 +591,7 @@ async def test_method_evaluate_rows_with_all_params(self, async_client: AsyncLla @parametrize async def test_raw_response_evaluate_rows(self, async_client: AsyncLlamaStackClient) -> None: response = await async_client.eval.with_raw_response.evaluate_rows( - task_id="task_id", + benchmark_id="benchmark_id", input_rows=[{"foo": True}], scoring_functions=["string"], task_config={ @@ -628,7 +606,6 @@ async def test_raw_response_evaluate_rows(self, async_client: AsyncLlamaStackCli "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -640,7 +617,7 @@ async def test_raw_response_evaluate_rows(self, async_client: AsyncLlamaStackCli @parametrize async def test_streaming_response_evaluate_rows(self, async_client: AsyncLlamaStackClient) -> None: async with async_client.eval.with_streaming_response.evaluate_rows( - task_id="task_id", + benchmark_id="benchmark_id", input_rows=[{"foo": True}], scoring_functions=["string"], task_config={ @@ 
-655,7 +632,6 @@ async def test_streaming_response_evaluate_rows(self, async_client: AsyncLlamaSt "type": "llm_as_judge", } }, - "type": "benchmark", }, ) as response: assert not response.is_closed @@ -668,9 +644,9 @@ async def test_streaming_response_evaluate_rows(self, async_client: AsyncLlamaSt @parametrize async def test_path_params_evaluate_rows(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.raises(ValueError, match=r"Expected a non-empty value for `task_id` but received ''"): + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): await async_client.eval.with_raw_response.evaluate_rows( - task_id="", + benchmark_id="", input_rows=[{"foo": True}], scoring_functions=["string"], task_config={ @@ -685,7 +661,6 @@ async def test_path_params_evaluate_rows(self, async_client: AsyncLlamaStackClie "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -707,7 +682,6 @@ async def test_method_evaluate_rows_alpha(self, async_client: AsyncLlamaStackCli "type": "llm_as_judge", } }, - "type": "benchmark", }, ) assert_matches_type(EvaluateResponse, eval, path=["response"]) @@ -741,7 +715,6 @@ async def test_method_evaluate_rows_alpha_with_all_params(self, async_client: As "prompt_template": "prompt_template", } }, - "type": "benchmark", "num_examples": 0, }, ) @@ -765,7 +738,6 @@ async def test_raw_response_evaluate_rows_alpha(self, async_client: AsyncLlamaSt "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -792,7 +764,6 @@ async def test_streaming_response_evaluate_rows_alpha(self, async_client: AsyncL "type": "llm_as_judge", } }, - "type": "benchmark", }, ) as response: assert not response.is_closed @@ -822,14 +793,13 @@ async def test_path_params_evaluate_rows_alpha(self, async_client: AsyncLlamaSta "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @parametrize async def test_method_run_eval(self, async_client: AsyncLlamaStackClient) -> None: eval = await async_client.eval.run_eval( - task_id="task_id", + benchmark_id="benchmark_id", task_config={ "eval_candidate": { "model": "model", @@ -842,7 +812,6 @@ async def test_method_run_eval(self, async_client: AsyncLlamaStackClient) -> Non "type": "llm_as_judge", } }, - "type": "benchmark", }, ) assert_matches_type(Job, eval, path=["response"]) @@ -850,7 +819,7 @@ async def test_method_run_eval(self, async_client: AsyncLlamaStackClient) -> Non @parametrize async def test_method_run_eval_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: eval = await async_client.eval.run_eval( - task_id="task_id", + benchmark_id="benchmark_id", task_config={ "eval_candidate": { "model": "model", @@ -874,7 +843,6 @@ async def test_method_run_eval_with_all_params(self, async_client: AsyncLlamaSta "prompt_template": "prompt_template", } }, - "type": "benchmark", "num_examples": 0, }, ) @@ -883,7 +851,7 @@ async def test_method_run_eval_with_all_params(self, async_client: AsyncLlamaSta @parametrize async def test_raw_response_run_eval(self, async_client: AsyncLlamaStackClient) -> None: response = await async_client.eval.with_raw_response.run_eval( - task_id="task_id", + benchmark_id="benchmark_id", task_config={ "eval_candidate": { "model": "model", @@ -896,7 +864,6 @@ async def test_raw_response_run_eval(self, async_client: AsyncLlamaStackClient) "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -908,7 +875,7 @@ async def test_raw_response_run_eval(self, async_client: AsyncLlamaStackClient) @parametrize async def test_streaming_response_run_eval(self, 
async_client: AsyncLlamaStackClient) -> None: async with async_client.eval.with_streaming_response.run_eval( - task_id="task_id", + benchmark_id="benchmark_id", task_config={ "eval_candidate": { "model": "model", @@ -921,7 +888,6 @@ async def test_streaming_response_run_eval(self, async_client: AsyncLlamaStackCl "type": "llm_as_judge", } }, - "type": "benchmark", }, ) as response: assert not response.is_closed @@ -934,9 +900,9 @@ async def test_streaming_response_run_eval(self, async_client: AsyncLlamaStackCl @parametrize async def test_path_params_run_eval(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.raises(ValueError, match=r"Expected a non-empty value for `task_id` but received ''"): + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): await async_client.eval.with_raw_response.run_eval( - task_id="", + benchmark_id="", task_config={ "eval_candidate": { "model": "model", @@ -949,7 +915,6 @@ async def test_path_params_run_eval(self, async_client: AsyncLlamaStackClient) - "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -969,7 +934,6 @@ async def test_method_run_eval_alpha(self, async_client: AsyncLlamaStackClient) "type": "llm_as_judge", } }, - "type": "benchmark", }, ) assert_matches_type(Job, eval, path=["response"]) @@ -1001,7 +965,6 @@ async def test_method_run_eval_alpha_with_all_params(self, async_client: AsyncLl "prompt_template": "prompt_template", } }, - "type": "benchmark", "num_examples": 0, }, ) @@ -1023,7 +986,6 @@ async def test_raw_response_run_eval_alpha(self, async_client: AsyncLlamaStackCl "type": "llm_as_judge", } }, - "type": "benchmark", }, ) @@ -1048,7 +1010,6 @@ async def test_streaming_response_run_eval_alpha(self, async_client: AsyncLlamaS "type": "llm_as_judge", } }, - "type": "benchmark", }, ) as response: assert not response.is_closed @@ -1076,6 +1037,5 @@ async def test_path_params_run_eval_alpha(self, async_client: AsyncLlamaStackCli "type": "llm_as_judge", } }, - "type": "benchmark", }, )
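
Usage sketch (a minimal example, not part of the generated patch): after this change, `eval.run_eval` and `eval.evaluate_rows` take `benchmark_id` instead of `task_id`, post to `/v1/eval/benchmarks/{benchmark_id}/...`, and `task_config` no longer carries the `"type": "benchmark"` discriminator. The call shapes below mirror tests/api_resources/test_eval.py in this patch; the base URL and the benchmark, model, judge, and scoring-function identifiers are placeholders, not values taken from the diff.

from llama_stack_client import LlamaStackClient

# Placeholder endpoint; point this at a running Llama Stack distribution.
client = LlamaStackClient(base_url="http://localhost:8321")

# BenchmarkConfigParam after this patch: no "type": "benchmark" key, num_examples stays optional.
task_config = {
    "eval_candidate": {
        "type": "model",
        "model": "my-model",  # placeholder model id
        "sampling_params": {"strategy": {"type": "greedy"}},
    },
    "scoring_params": {
        "my-scoring-fn": {  # placeholder scoring function id
            "type": "llm_as_judge",
            "judge_model": "my-judge-model",  # placeholder judge model id
            "judge_score_regexes": ["Answer: (.*)"],  # placeholder regex
        }
    },
    "num_examples": 10,
}

# Start an eval job for an already-registered benchmark
# (POST /v1/eval/benchmarks/{benchmark_id}/jobs).
job = client.eval.run_eval(
    benchmark_id="my-benchmark",  # placeholder; must be a registered benchmark id
    task_config=task_config,
)
print(job)

# Evaluate explicit rows against the same benchmark
# (POST /v1/eval/benchmarks/{benchmark_id}/evaluations).
result = client.eval.evaluate_rows(
    benchmark_id="my-benchmark",
    input_rows=[{"input_query": "2 + 2 = ?", "expected_answer": "4"}],  # placeholder rows
    scoring_functions=["my-scoring-fn"],
    task_config=task_config,
)
print(result)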