From 7b9bbad0826f71f3004d5e58cdcd4404ac0294d9 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Wed, 19 Feb 2025 23:17:09 -0800 Subject: [PATCH 1/6] temp commit --- scripts/open_benchmark/simpleqa.py | 81 +++++++++++++ .../lib/cli/eval/run_benchmark.py | 106 +++++++++++------- 2 files changed, 145 insertions(+), 42 deletions(-) create mode 100644 scripts/open_benchmark/simpleqa.py diff --git a/scripts/open_benchmark/simpleqa.py b/scripts/open_benchmark/simpleqa.py new file mode 100644 index 00000000..ee4ac03c --- /dev/null +++ b/scripts/open_benchmark/simpleqa.py @@ -0,0 +1,81 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. + +import asyncio +from typing import Optional + +import fire +from llama_stack_client import LlamaStackClient + + +async def run_main( + host: str, + port: int, + model_id: str, + use_https: Optional[bool] = False, + cert_path: Optional[str] = None, +): + + # Construct the base URL with the appropriate protocol + protocol = "https" if use_https else "http" + base_url = f"{protocol}://{host}:{port}" + + # Configure client with SSL certificate if provided + client_kwargs = {"base_url": base_url} + if use_https and cert_path: + client_kwargs["verify"] = cert_path + + client = LlamaStackClient(**client_kwargs) + + eval_rows = client.datasetio.get_rows_paginated( + dataset_id="simpleqa", + rows_in_page=5, + ) + + response = client.eval.evaluate_rows_alpha( + benchmark_id="meta-reference-simpleqa", + input_rows=eval_rows.rows, + scoring_functions=["llm-as-judge::405b-simpleqa"], + task_config={ + "type": "benchmark", + "eval_candidate": { + "type": "model", + "model": model_id, + "sampling_params": { + "temperature": 0.0, + "max_tokens": 4096, + "top_p": 0.9, + "repeat_penalty": 1.0, + }, + }, + }, + ) + + print(response) + + +def main( + host: str, + port: int, + model: str, + use_https: Optional[bool] = False, + cert_path: Optional[str] = None, +): + asyncio.run( + run_main( + host, + port, + model, + use_https, + cert_path, + ) + ) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index ac03c564..51841f28 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -16,18 +16,19 @@ @click.command("run-benchmark") -@click.argument("eval-task-ids", nargs=-1, required=True) -@click.option( - "--eval-task-config", - required=True, - help="Path to the eval task config file in JSON format", - type=click.Path(exists=True), -) -@click.option( - "--output-dir", - required=True, - help="Path to the dump eval results output directory", -) +@click.argument("model_id", required=True) +@click.argument("benchmark_ids", nargs=-1, required=True) +# @click.option( +# "--eval-task-config", +# required=True, +# help="Path to the eval task config file in JSON format", +# type=click.Path(exists=True), +# ) +# @click.option( +# "--output-dir", +# required=True, +# help="Path to the dump eval results output directory", +# ) @click.option( "--num-examples", required=False, @@ -44,36 +45,57 @@ @click.pass_context def run_benchmark( ctx, - eval_task_ids: tuple[str, ...], - eval_task_config: 
str, - output_dir: str, + model_id: str, + benchmark_ids: tuple[str, ...], + # eval_task_config: str, + # output_dir: str, num_examples: Optional[int], + temperature, + max_tokens, + top_p, + repetition_penalty, visualize: bool, ): """Run a evaluation benchmark task""" client = ctx.obj["client"] - for eval_task_id in eval_task_ids: - eval_task = client.eval_tasks.retrieve(name=eval_task_id) - scoring_functions = eval_task.scoring_functions - dataset_id = eval_task.dataset_id + for benchmark_id in benchmark_ids: + benchmark = client.benchmarks.retrieve(benchmark_id=benchmark_id) + scoring_functions = benchmark.scoring_functions + dataset_id = benchmark.dataset_id + + print("scoring_functions", scoring_functions) + print("dataset_id", dataset_id) rows = client.datasetio.get_rows_paginated( - dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples + dataset_id=dataset_id, + rows_in_page=-1 if num_examples is None else num_examples, ) - with open(eval_task_config, "r") as f: - eval_task_config = json.load(f) + # with open(eval_task_config, "r") as f: + # eval_task_config = json.load(f) output_res = {} for r in tqdm(rows.rows): - eval_res = client.eval.evaluate_rows( - task_id=eval_task_id, + eval_res = client.eval.evaluate_rows_alpha( + benchmark_id=benchmark_id, input_rows=[r], scoring_functions=scoring_functions, - task_config=eval_task_config, + task_config={ + "type": "benchmark", + "eval_candidate": { + "type": "model", + "model": model_id, + "sampling_params": { + "temperature": 0.0, + "max_tokens": 4096, + "top_p": 0.9, + "repeat_penalty": 1.0, + }, + }, + }, ) for k in r.keys(): if k not in output_res: @@ -90,20 +112,20 @@ def run_benchmark( output_res[scoring_fn] = [] output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) - # Create output directory if it doesn't exist - os.makedirs(output_dir, exist_ok=True) - # Save results to JSON file - output_file = os.path.join(output_dir, f"{eval_task_id}_results.json") - with open(output_file, "w") as f: - json.dump(output_res, f, indent=2) - - rprint(f"[green]✓[/green] Results saved to: [blue]{output_file}[/blue]!\n") - - if visualize: - for scoring_fn in scoring_functions: - res = output_res[scoring_fn] - assert len(res) > 0 and "score" in res[0] - scores = [str(r["score"]) for r in res] - unique_scores = sorted(list(set(scores))) - counts = [scores.count(s) for s in unique_scores] - create_bar_chart(counts, unique_scores, title=f"{scoring_fn}") + # # Create output directory if it doesn't exist + # os.makedirs(output_dir, exist_ok=True) + # # Save results to JSON file + # output_file = os.path.join(output_dir, f"{eval_task_id}_results.json") + # with open(output_file, "w") as f: + # json.dump(output_res, f, indent=2) + + # rprint(f"[green]✓[/green] Results saved to: [blue]{output_file}[/blue]!\n") + + # if visualize: + # for scoring_fn in scoring_functions: + # res = output_res[scoring_fn] + # assert len(res) > 0 and "score" in res[0] + # scores = [str(r["score"]) for r in res] + # unique_scores = sorted(list(set(scores))) + # counts = [scores.count(s) for s in unique_scores] + # create_bar_chart(counts, unique_scores, title=f"{scoring_fn}") From a639585629adb36050fb642970a9ab27f542883d Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Thu, 20 Feb 2025 16:07:27 -0800 Subject: [PATCH 2/6] init commit --- .../lib/cli/eval/run_benchmark.py | 168 +++++++++++++----- src/llama_stack_client/lib/cli/eval/utils.py | 55 ++++++ 2 files changed, 178 insertions(+), 45 deletions(-) create mode 100644 
src/llama_stack_client/lib/cli/eval/utils.py diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index 51841f28..ae7eb23a 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -13,29 +13,63 @@ from tqdm.rich import tqdm from ..common.utils import create_bar_chart +from .utils import ( + aggregate_accuracy, + aggregate_average, + aggregate_categorical_count, + aggregate_median, +) @click.command("run-benchmark") -@click.argument("model_id", required=True) @click.argument("benchmark_ids", nargs=-1, required=True) -# @click.option( -# "--eval-task-config", -# required=True, -# help="Path to the eval task config file in JSON format", -# type=click.Path(exists=True), -# ) -# @click.option( -# "--output-dir", -# required=True, -# help="Path to the dump eval results output directory", -# ) @click.option( - "--num-examples", + "--model_id", + required=True, + help="model id to run the benchmark eval on", + default=None, + type=str, +) +@click.option( + "--output_dir", + required=True, + help="Path to the dump eval results output directory", +) +@click.option( + "--num_examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None, type=int, ) +@click.option( + "--temperature", + required=False, + help="temperature in the sampling params to run generation", + default=0.0, + type=float, +) +@click.option( + "--max_tokens", + required=False, + help="max_tokens in the sampling params to run generation", + default=4096, + type=int, +) +@click.option( + "--top_p", + required=False, + help="top_p in the sampling params to run generation", + default=0.9, + type=float, +) +@click.option( + "--repeat_penalty", + required=False, + help="repeat_penalty in the sampling params to run generation", + default=1.0, + type=float, +) @click.option( "--visualize", is_flag=True, @@ -45,15 +79,14 @@ @click.pass_context def run_benchmark( ctx, - model_id: str, benchmark_ids: tuple[str, ...], - # eval_task_config: str, - # output_dir: str, + model_id: str, + output_dir: str, num_examples: Optional[int], - temperature, - max_tokens, - top_p, - repetition_penalty, + temperature: float, + max_tokens: int, + top_p: float, + repeat_penalty: float, visualize: bool, ): """Run a evaluation benchmark task""" @@ -67,18 +100,20 @@ def run_benchmark( print("scoring_functions", scoring_functions) print("dataset_id", dataset_id) + print("model_id", model_id) + print("temperature", temperature) + print("max_tokens", max_tokens) + print("top_p", top_p) + print("repeat_penalty", repeat_penalty) rows = client.datasetio.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples, ) - # with open(eval_task_config, "r") as f: - # eval_task_config = json.load(f) - output_res = {} - for r in tqdm(rows.rows): + for i, r in enumerate(tqdm(rows.rows)): eval_res = client.eval.evaluate_rows_alpha( benchmark_id=benchmark_id, input_rows=[r], @@ -89,10 +124,10 @@ def run_benchmark( "type": "model", "model": model_id, "sampling_params": { - "temperature": 0.0, - "max_tokens": 4096, - "top_p": 0.9, - "repeat_penalty": 1.0, + "temperature": temperature, + "max_tokens": max_tokens, + "top_p": top_p, + "repeat_penalty": repeat_penalty, }, }, }, @@ -112,20 +147,63 @@ def run_benchmark( output_res[scoring_fn] = [] output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) - # # Create output directory if it doesn't exist - # 
os.makedirs(output_dir, exist_ok=True) - # # Save results to JSON file - # output_file = os.path.join(output_dir, f"{eval_task_id}_results.json") - # with open(output_file, "w") as f: - # json.dump(output_res, f, indent=2) - - # rprint(f"[green]✓[/green] Results saved to: [blue]{output_file}[/blue]!\n") - - # if visualize: - # for scoring_fn in scoring_functions: - # res = output_res[scoring_fn] - # assert len(res) > 0 and "score" in res[0] - # scores = [str(r["score"]) for r in res] - # unique_scores = sorted(list(set(scores))) - # counts = [scores.count(s) for s in unique_scores] - # create_bar_chart(counts, unique_scores, title=f"{scoring_fn}") + aggregation_functions = client.scoring_functions.retrieve( + scoring_fn_id=scoring_fn + ).params.aggregation_functions + + # only output the aggregation result for the last row + if i == len(rows.rows) - 1: + for aggregation_function in aggregation_functions: + scoring_results = output_res[scoring_fn] + if aggregation_function == "categorical_count": + output_res[scoring_fn].append( + aggregate_categorical_count(scoring_results) + ) + elif aggregation_function == "average": + output_res[scoring_fn].append( + aggregate_average(scoring_results) + ) + elif aggregation_function == "median": + output_res[scoring_fn].append( + aggregate_median(scoring_results) + ) + elif aggregation_function == "accuracy": + output_res[scoring_fn].append( + aggregate_accuracy(scoring_results) + ) + else: + raise NotImplementedError( + f"Aggregation function {aggregation_function} is not supported yet" + ) + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + # Save results to JSON file + output_file = os.path.join(output_dir, f"{benchmark_id}_results.json") + with open(output_file, "w") as f: + json.dump(output_res, f, indent=2) + + rprint(f"[green]✓[/green] Results saved to: [blue]{output_file}[/blue]!\n") + + if visualize: + for scoring_fn in scoring_functions: + aggregation_functions = client.scoring_functions.retrieve( + scoring_fn_id=scoring_fn + ).params.aggregation_functions + + for aggregation_function in aggregation_functions: + res = output_res[scoring_fn] + assert len(res) > 0 and "score" in res[0] + if aggregation_function == "categorical_count": + scores = [str(r["score"]) for r in res] + unique_scores = sorted(list(set(scores))) + counts = [scores.count(s) for s in unique_scores] + create_bar_chart( + counts, + unique_scores, + title=f"{scoring_fn}-{aggregation_function}", + ) + else: + raise NotImplementedError( + f"Aggregation function {aggregation_function} ius not supported for visualization yet" + ) diff --git a/src/llama_stack_client/lib/cli/eval/utils.py b/src/llama_stack_client/lib/cli/eval/utils.py new file mode 100644 index 00000000..ec2e3efa --- /dev/null +++ b/src/llama_stack_client/lib/cli/eval/utils.py @@ -0,0 +1,55 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import Any, Dict, List, Union + + +def aggregate_categorical_count( + scoring_results: List[ + Dict[str, Union[bool, float, str, List[object], object, None]] + ], +) -> Dict[str, Any]: + scores = [str(r["score"]) for r in scoring_results] + unique_scores = sorted(list(set(scores))) + return {"categorical_count": {s: scores.count(s) for s in unique_scores}} + + +def aggregate_average( + scoring_results: List[ + Dict[str, Union[bool, float, str, List[object], object, None]] + ], +) -> Dict[str, Any]: + return { + "average": sum( + result["score"] for result in scoring_results if result["score"] is not None + ) + / len([_ for _ in scoring_results if _["score"] is not None]), + } + + +def aggregate_median( + scoring_results: List[ + Dict[str, Union[bool, float, str, List[object], object, None]] + ], +) -> Dict[str, Any]: + scores = [r["score"] for r in scoring_results if r["score"] is not None] + median = statistics.median(scores) if scores else None + return {"median": median} + + +def aggregate_accuracy( + scoring_results: List[ + Dict[str, Union[bool, float, str, List[object], object, None]] + ], +) -> Dict[str, Any]: + num_correct = sum(result["score"] for result in scoring_results) + avg_score = num_correct / len(scoring_results) + + return { + "accuracy": avg_score, + "num_correct": num_correct, + "num_total": len(scoring_results), + } From b74e46af8dd693df19b72aaea03b14522223ba5f Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Thu, 20 Feb 2025 16:08:49 -0800 Subject: [PATCH 3/6] refine --- scripts/open_benchmark/simpleqa.py | 81 ------------------- .../lib/cli/eval/run_benchmark.py | 8 -- 2 files changed, 89 deletions(-) delete mode 100644 scripts/open_benchmark/simpleqa.py diff --git a/scripts/open_benchmark/simpleqa.py b/scripts/open_benchmark/simpleqa.py deleted file mode 100644 index ee4ac03c..00000000 --- a/scripts/open_benchmark/simpleqa.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. 
- -import asyncio -from typing import Optional - -import fire -from llama_stack_client import LlamaStackClient - - -async def run_main( - host: str, - port: int, - model_id: str, - use_https: Optional[bool] = False, - cert_path: Optional[str] = None, -): - - # Construct the base URL with the appropriate protocol - protocol = "https" if use_https else "http" - base_url = f"{protocol}://{host}:{port}" - - # Configure client with SSL certificate if provided - client_kwargs = {"base_url": base_url} - if use_https and cert_path: - client_kwargs["verify"] = cert_path - - client = LlamaStackClient(**client_kwargs) - - eval_rows = client.datasetio.get_rows_paginated( - dataset_id="simpleqa", - rows_in_page=5, - ) - - response = client.eval.evaluate_rows_alpha( - benchmark_id="meta-reference-simpleqa", - input_rows=eval_rows.rows, - scoring_functions=["llm-as-judge::405b-simpleqa"], - task_config={ - "type": "benchmark", - "eval_candidate": { - "type": "model", - "model": model_id, - "sampling_params": { - "temperature": 0.0, - "max_tokens": 4096, - "top_p": 0.9, - "repeat_penalty": 1.0, - }, - }, - }, - ) - - print(response) - - -def main( - host: str, - port: int, - model: str, - use_https: Optional[bool] = False, - cert_path: Optional[str] = None, -): - asyncio.run( - run_main( - host, - port, - model, - use_https, - cert_path, - ) - ) - - -if __name__ == "__main__": - fire.Fire(main) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index ae7eb23a..9ac3c889 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -98,14 +98,6 @@ def run_benchmark( scoring_functions = benchmark.scoring_functions dataset_id = benchmark.dataset_id - print("scoring_functions", scoring_functions) - print("dataset_id", dataset_id) - print("model_id", model_id) - print("temperature", temperature) - print("max_tokens", max_tokens) - print("top_p", top_p) - print("repeat_penalty", repeat_penalty) - rows = client.datasetio.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples, From 31296fb9be0c96222b88c8bf413ef8f1855b0376 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Thu, 20 Feb 2025 16:22:40 -0800 Subject: [PATCH 4/6] pre-commit --- .../lib/cli/eval/run_benchmark.py | 16 ++++----------- src/llama_stack_client/lib/cli/eval/utils.py | 20 +++++-------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index 9ac3c889..4de1d340 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -148,21 +148,13 @@ def run_benchmark( for aggregation_function in aggregation_functions: scoring_results = output_res[scoring_fn] if aggregation_function == "categorical_count": - output_res[scoring_fn].append( - aggregate_categorical_count(scoring_results) - ) + output_res[scoring_fn].append(aggregate_categorical_count(scoring_results)) elif aggregation_function == "average": - output_res[scoring_fn].append( - aggregate_average(scoring_results) - ) + output_res[scoring_fn].append(aggregate_average(scoring_results)) elif aggregation_function == "median": - output_res[scoring_fn].append( - aggregate_median(scoring_results) - ) + output_res[scoring_fn].append(aggregate_median(scoring_results)) elif aggregation_function == "accuracy": - output_res[scoring_fn].append( 
- aggregate_accuracy(scoring_results) - ) + output_res[scoring_fn].append(aggregate_accuracy(scoring_results)) else: raise NotImplementedError( f"Aggregation function {aggregation_function} is not supported yet" diff --git a/src/llama_stack_client/lib/cli/eval/utils.py b/src/llama_stack_client/lib/cli/eval/utils.py index ec2e3efa..102b8817 100644 --- a/src/llama_stack_client/lib/cli/eval/utils.py +++ b/src/llama_stack_client/lib/cli/eval/utils.py @@ -8,9 +8,7 @@ def aggregate_categorical_count( - scoring_results: List[ - Dict[str, Union[bool, float, str, List[object], object, None]] - ], + scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]], ) -> Dict[str, Any]: scores = [str(r["score"]) for r in scoring_results] unique_scores = sorted(list(set(scores))) @@ -18,22 +16,16 @@ def aggregate_categorical_count( def aggregate_average( - scoring_results: List[ - Dict[str, Union[bool, float, str, List[object], object, None]] - ], + scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]], ) -> Dict[str, Any]: return { - "average": sum( - result["score"] for result in scoring_results if result["score"] is not None - ) + "average": sum(result["score"] for result in scoring_results if result["score"] is not None) / len([_ for _ in scoring_results if _["score"] is not None]), } def aggregate_median( - scoring_results: List[ - Dict[str, Union[bool, float, str, List[object], object, None]] - ], + scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]], ) -> Dict[str, Any]: scores = [r["score"] for r in scoring_results if r["score"] is not None] median = statistics.median(scores) if scores else None @@ -41,9 +33,7 @@ def aggregate_median( def aggregate_accuracy( - scoring_results: List[ - Dict[str, Union[bool, float, str, List[object], object, None]] - ], + scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]], ) -> Dict[str, Any]: num_correct = sum(result["score"] for result in scoring_results) avg_score = num_correct / len(scoring_results) From 9324dfe08023fcc3e556aa0a9ac33450ca48b66f Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Thu, 20 Feb 2025 17:12:31 -0800 Subject: [PATCH 5/6] address comment --- .../lib/cli/eval/run_benchmark.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index 4de1d340..f6eab9c8 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -24,19 +24,19 @@ @click.command("run-benchmark") @click.argument("benchmark_ids", nargs=-1, required=True) @click.option( - "--model_id", + "--model-id", required=True, help="model id to run the benchmark eval on", default=None, type=str, ) @click.option( - "--output_dir", + "--output-dir", required=True, help="Path to the dump eval results output directory", ) @click.option( - "--num_examples", + "--num-examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None, @@ -50,23 +50,23 @@ type=float, ) @click.option( - "--max_tokens", + "--max-tokens", required=False, - help="max_tokens in the sampling params to run generation", + help="max-tokens in the sampling params to run generation", default=4096, type=int, ) @click.option( - "--top_p", + "--top-p", required=False, - help="top_p in the sampling params to run generation", + help="top-p in the sampling params to run 
generation", default=0.9, type=float, ) @click.option( - "--repeat_penalty", + "--repeat-penalty", required=False, - help="repeat_penalty in the sampling params to run generation", + help="repeat-penalty in the sampling params to run generation", default=1.0, type=float, ) From d708ef52d3734079448ec357c21d83c055e9e516 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Thu, 20 Feb 2025 17:13:28 -0800 Subject: [PATCH 6/6] address comment --- src/llama_stack_client/lib/cli/eval/run_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index f6eab9c8..933b1338 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -22,7 +22,7 @@ @click.command("run-benchmark") -@click.argument("benchmark_ids", nargs=-1, required=True) +@click.argument("benchmark-ids", nargs=-1, required=True) @click.option( "--model-id", required=True,