From 7b9bbad0826f71f3004d5e58cdcd4404ac0294d9 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Wed, 19 Feb 2025 23:17:09 -0800 Subject: [PATCH 1/6] temp commit --- scripts/open_benchmark/simpleqa.py | 81 +++++++++++++ .../lib/cli/eval/run_benchmark.py | 106 +++++++++++------- 2 files changed, 145 insertions(+), 42 deletions(-) create mode 100644 scripts/open_benchmark/simpleqa.py diff --git a/scripts/open_benchmark/simpleqa.py b/scripts/open_benchmark/simpleqa.py new file mode 100644 index 00000000..ee4ac03c --- /dev/null +++ b/scripts/open_benchmark/simpleqa.py @@ -0,0 +1,81 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. + +import asyncio +from typing import Optional + +import fire +from llama_stack_client import LlamaStackClient + + +async def run_main( + host: str, + port: int, + model_id: str, + use_https: Optional[bool] = False, + cert_path: Optional[str] = None, +): + + # Construct the base URL with the appropriate protocol + protocol = "https" if use_https else "http" + base_url = f"{protocol}://{host}:{port}" + + # Configure client with SSL certificate if provided + client_kwargs = {"base_url": base_url} + if use_https and cert_path: + client_kwargs["verify"] = cert_path + + client = LlamaStackClient(**client_kwargs) + + eval_rows = client.datasetio.get_rows_paginated( + dataset_id="simpleqa", + rows_in_page=5, + ) + + response = client.eval.evaluate_rows_alpha( + benchmark_id="meta-reference-simpleqa", + input_rows=eval_rows.rows, + scoring_functions=["llm-as-judge::405b-simpleqa"], + task_config={ + "type": "benchmark", + "eval_candidate": { + "type": "model", + "model": model_id, + "sampling_params": { + "temperature": 0.0, + "max_tokens": 4096, + "top_p": 0.9, + "repeat_penalty": 1.0, + }, + }, + }, + ) + + print(response) + + +def main( + host: str, + port: int, + model: str, + use_https: Optional[bool] = False, + cert_path: Optional[str] = None, +): + asyncio.run( + run_main( + host, + port, + model, + use_https, + cert_path, + ) + ) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index ac03c564..51841f28 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -16,18 +16,19 @@ @click.command("run-benchmark") -@click.argument("eval-task-ids", nargs=-1, required=True) -@click.option( - "--eval-task-config", - required=True, - help="Path to the eval task config file in JSON format", - type=click.Path(exists=True), -) -@click.option( - "--output-dir", - required=True, - help="Path to the dump eval results output directory", -) +@click.argument("model_id", required=True) +@click.argument("benchmark_ids", nargs=-1, required=True) +# @click.option( +# "--eval-task-config", +# required=True, +# help="Path to the eval task config file in JSON format", +# type=click.Path(exists=True), +# ) +# @click.option( +# "--output-dir", +# required=True, +# help="Path to the dump eval results output directory", +# ) @click.option( "--num-examples", required=False, @@ -44,36 +45,57 @@ @click.pass_context def run_benchmark( ctx, - eval_task_ids: tuple[str, ...], - eval_task_config: 
str, - output_dir: str, + model_id: str, + benchmark_ids: tuple[str, ...], + # eval_task_config: str, + # output_dir: str, num_examples: Optional[int], + temperature, + max_tokens, + top_p, + repetition_penalty, visualize: bool, ): """Run a evaluation benchmark task""" client = ctx.obj["client"] - for eval_task_id in eval_task_ids: - eval_task = client.eval_tasks.retrieve(name=eval_task_id) - scoring_functions = eval_task.scoring_functions - dataset_id = eval_task.dataset_id + for benchmark_id in benchmark_ids: + benchmark = client.benchmarks.retrieve(benchmark_id=benchmark_id) + scoring_functions = benchmark.scoring_functions + dataset_id = benchmark.dataset_id + + print("scoring_functions", scoring_functions) + print("dataset_id", dataset_id) rows = client.datasetio.get_rows_paginated( - dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples + dataset_id=dataset_id, + rows_in_page=-1 if num_examples is None else num_examples, ) - with open(eval_task_config, "r") as f: - eval_task_config = json.load(f) + # with open(eval_task_config, "r") as f: + # eval_task_config = json.load(f) output_res = {} for r in tqdm(rows.rows): - eval_res = client.eval.evaluate_rows( - task_id=eval_task_id, + eval_res = client.eval.evaluate_rows_alpha( + benchmark_id=benchmark_id, input_rows=[r], scoring_functions=scoring_functions, - task_config=eval_task_config, + task_config={ + "type": "benchmark", + "eval_candidate": { + "type": "model", + "model": model_id, + "sampling_params": { + "temperature": 0.0, + "max_tokens": 4096, + "top_p": 0.9, + "repeat_penalty": 1.0, + }, + }, + }, ) for k in r.keys(): if k not in output_res: @@ -90,20 +112,20 @@ def run_benchmark( output_res[scoring_fn] = [] output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) - # Create output directory if it doesn't exist - os.makedirs(output_dir, exist_ok=True) - # Save results to JSON file - output_file = os.path.join(output_dir, f"{eval_task_id}_results.json") - with open(output_file, "w") as f: - json.dump(output_res, f, indent=2) - - rprint(f"[green]✓[/green] Results saved to: [blue]{output_file}[/blue]!\n") - - if visualize: - for scoring_fn in scoring_functions: - res = output_res[scoring_fn] - assert len(res) > 0 and "score" in res[0] - scores = [str(r["score"]) for r in res] - unique_scores = sorted(list(set(scores))) - counts = [scores.count(s) for s in unique_scores] - create_bar_chart(counts, unique_scores, title=f"{scoring_fn}") + # # Create output directory if it doesn't exist + # os.makedirs(output_dir, exist_ok=True) + # # Save results to JSON file + # output_file = os.path.join(output_dir, f"{eval_task_id}_results.json") + # with open(output_file, "w") as f: + # json.dump(output_res, f, indent=2) + + # rprint(f"[green]✓[/green] Results saved to: [blue]{output_file}[/blue]!\n") + + # if visualize: + # for scoring_fn in scoring_functions: + # res = output_res[scoring_fn] + # assert len(res) > 0 and "score" in res[0] + # scores = [str(r["score"]) for r in res] + # unique_scores = sorted(list(set(scores))) + # counts = [scores.count(s) for s in unique_scores] + # create_bar_chart(counts, unique_scores, title=f"{scoring_fn}") From a639585629adb36050fb642970a9ab27f542883d Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Thu, 20 Feb 2025 16:07:27 -0800 Subject: [PATCH 2/6] init commit --- .../lib/cli/eval/run_benchmark.py | 168 +++++++++++++----- src/llama_stack_client/lib/cli/eval/utils.py | 55 ++++++ 2 files changed, 178 insertions(+), 45 deletions(-) create mode 100644 
src/llama_stack_client/lib/cli/eval/utils.py diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index 51841f28..ae7eb23a 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -13,29 +13,63 @@ from tqdm.rich import tqdm from ..common.utils import create_bar_chart +from .utils import ( + aggregate_accuracy, + aggregate_average, + aggregate_categorical_count, + aggregate_median, +) @click.command("run-benchmark") -@click.argument("model_id", required=True) @click.argument("benchmark_ids", nargs=-1, required=True) -# @click.option( -# "--eval-task-config", -# required=True, -# help="Path to the eval task config file in JSON format", -# type=click.Path(exists=True), -# ) -# @click.option( -# "--output-dir", -# required=True, -# help="Path to the dump eval results output directory", -# ) @click.option( - "--num-examples", + "--model_id", + required=True, + help="model id to run the benchmark eval on", + default=None, + type=str, +) +@click.option( + "--output_dir", + required=True, + help="Path to the dump eval results output directory", +) +@click.option( + "--num_examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None, type=int, ) +@click.option( + "--temperature", + required=False, + help="temperature in the sampling params to run generation", + default=0.0, + type=float, +) +@click.option( + "--max_tokens", + required=False, + help="max_tokens in the sampling params to run generation", + default=4096, + type=int, +) +@click.option( + "--top_p", + required=False, + help="top_p in the sampling params to run generation", + default=0.9, + type=float, +) +@click.option( + "--repeat_penalty", + required=False, + help="repeat_penalty in the sampling params to run generation", + default=1.0, + type=float, +) @click.option( "--visualize", is_flag=True, @@ -45,15 +79,14 @@ @click.pass_context def run_benchmark( ctx, - model_id: str, benchmark_ids: tuple[str, ...], - # eval_task_config: str, - # output_dir: str, + model_id: str, + output_dir: str, num_examples: Optional[int], - temperature, - max_tokens, - top_p, - repetition_penalty, + temperature: float, + max_tokens: int, + top_p: float, + repeat_penalty: float, visualize: bool, ): """Run a evaluation benchmark task""" @@ -67,18 +100,20 @@ def run_benchmark( print("scoring_functions", scoring_functions) print("dataset_id", dataset_id) + print("model_id", model_id) + print("temperature", temperature) + print("max_tokens", max_tokens) + print("top_p", top_p) + print("repeat_penalty", repeat_penalty) rows = client.datasetio.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples, ) - # with open(eval_task_config, "r") as f: - # eval_task_config = json.load(f) - output_res = {} - for r in tqdm(rows.rows): + for i, r in enumerate(tqdm(rows.rows)): eval_res = client.eval.evaluate_rows_alpha( benchmark_id=benchmark_id, input_rows=[r], @@ -89,10 +124,10 @@ def run_benchmark( "type": "model", "model": model_id, "sampling_params": { - "temperature": 0.0, - "max_tokens": 4096, - "top_p": 0.9, - "repeat_penalty": 1.0, + "temperature": temperature, + "max_tokens": max_tokens, + "top_p": top_p, + "repeat_penalty": repeat_penalty, }, }, }, @@ -112,20 +147,63 @@ def run_benchmark( output_res[scoring_fn] = [] output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) - # # Create output directory if it doesn't exist - # 
os.makedirs(output_dir, exist_ok=True) - # # Save results to JSON file - # output_file = os.path.join(output_dir, f"{eval_task_id}_results.json") - # with open(output_file, "w") as f: - # json.dump(output_res, f, indent=2) - - # rprint(f"[green]✓[/green] Results saved to: [blue]{output_file}[/blue]!\n") - - # if visualize: - # for scoring_fn in scoring_functions: - # res = output_res[scoring_fn] - # assert len(res) > 0 and "score" in res[0] - # scores = [str(r["score"]) for r in res] - # unique_scores = sorted(list(set(scores))) - # counts = [scores.count(s) for s in unique_scores] - # create_bar_chart(counts, unique_scores, title=f"{scoring_fn}") + aggregation_functions = client.scoring_functions.retrieve( + scoring_fn_id=scoring_fn + ).params.aggregation_functions + + # only output the aggregation result for the last row + if i == len(rows.rows) - 1: + for aggregation_function in aggregation_functions: + scoring_results = output_res[scoring_fn] + if aggregation_function == "categorical_count": + output_res[scoring_fn].append( + aggregate_categorical_count(scoring_results) + ) + elif aggregation_function == "average": + output_res[scoring_fn].append( + aggregate_average(scoring_results) + ) + elif aggregation_function == "median": + output_res[scoring_fn].append( + aggregate_median(scoring_results) + ) + elif aggregation_function == "accuracy": + output_res[scoring_fn].append( + aggregate_accuracy(scoring_results) + ) + else: + raise NotImplementedError( + f"Aggregation function {aggregation_function} is not supported yet" + ) + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + # Save results to JSON file + output_file = os.path.join(output_dir, f"{benchmark_id}_results.json") + with open(output_file, "w") as f: + json.dump(output_res, f, indent=2) + + rprint(f"[green]✓[/green] Results saved to: [blue]{output_file}[/blue]!\n") + + if visualize: + for scoring_fn in scoring_functions: + aggregation_functions = client.scoring_functions.retrieve( + scoring_fn_id=scoring_fn + ).params.aggregation_functions + + for aggregation_function in aggregation_functions: + res = output_res[scoring_fn] + assert len(res) > 0 and "score" in res[0] + if aggregation_function == "categorical_count": + scores = [str(r["score"]) for r in res] + unique_scores = sorted(list(set(scores))) + counts = [scores.count(s) for s in unique_scores] + create_bar_chart( + counts, + unique_scores, + title=f"{scoring_fn}-{aggregation_function}", + ) + else: + raise NotImplementedError( + f"Aggregation function {aggregation_function} ius not supported for visualization yet" + ) diff --git a/src/llama_stack_client/lib/cli/eval/utils.py b/src/llama_stack_client/lib/cli/eval/utils.py new file mode 100644 index 00000000..ec2e3efa --- /dev/null +++ b/src/llama_stack_client/lib/cli/eval/utils.py @@ -0,0 +1,55 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import Any, Dict, List, Union + + +def aggregate_categorical_count( + scoring_results: List[ + Dict[str, Union[bool, float, str, List[object], object, None]] + ], +) -> Dict[str, Any]: + scores = [str(r["score"]) for r in scoring_results] + unique_scores = sorted(list(set(scores))) + return {"categorical_count": {s: scores.count(s) for s in unique_scores}} + + +def aggregate_average( + scoring_results: List[ + Dict[str, Union[bool, float, str, List[object], object, None]] + ], +) -> Dict[str, Any]: + return { + "average": sum( + result["score"] for result in scoring_results if result["score"] is not None + ) + / len([_ for _ in scoring_results if _["score"] is not None]), + } + + +def aggregate_median( + scoring_results: List[ + Dict[str, Union[bool, float, str, List[object], object, None]] + ], +) -> Dict[str, Any]: + scores = [r["score"] for r in scoring_results if r["score"] is not None] + median = statistics.median(scores) if scores else None + return {"median": median} + + +def aggregate_accuracy( + scoring_results: List[ + Dict[str, Union[bool, float, str, List[object], object, None]] + ], +) -> Dict[str, Any]: + num_correct = sum(result["score"] for result in scoring_results) + avg_score = num_correct / len(scoring_results) + + return { + "accuracy": avg_score, + "num_correct": num_correct, + "num_total": len(scoring_results), + } From b74e46af8dd693df19b72aaea03b14522223ba5f Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Thu, 20 Feb 2025 16:08:49 -0800 Subject: [PATCH 3/6] refine --- scripts/open_benchmark/simpleqa.py | 81 ------------------- .../lib/cli/eval/run_benchmark.py | 8 -- 2 files changed, 89 deletions(-) delete mode 100644 scripts/open_benchmark/simpleqa.py diff --git a/scripts/open_benchmark/simpleqa.py b/scripts/open_benchmark/simpleqa.py deleted file mode 100644 index ee4ac03c..00000000 --- a/scripts/open_benchmark/simpleqa.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. 
- -import asyncio -from typing import Optional - -import fire -from llama_stack_client import LlamaStackClient - - -async def run_main( - host: str, - port: int, - model_id: str, - use_https: Optional[bool] = False, - cert_path: Optional[str] = None, -): - - # Construct the base URL with the appropriate protocol - protocol = "https" if use_https else "http" - base_url = f"{protocol}://{host}:{port}" - - # Configure client with SSL certificate if provided - client_kwargs = {"base_url": base_url} - if use_https and cert_path: - client_kwargs["verify"] = cert_path - - client = LlamaStackClient(**client_kwargs) - - eval_rows = client.datasetio.get_rows_paginated( - dataset_id="simpleqa", - rows_in_page=5, - ) - - response = client.eval.evaluate_rows_alpha( - benchmark_id="meta-reference-simpleqa", - input_rows=eval_rows.rows, - scoring_functions=["llm-as-judge::405b-simpleqa"], - task_config={ - "type": "benchmark", - "eval_candidate": { - "type": "model", - "model": model_id, - "sampling_params": { - "temperature": 0.0, - "max_tokens": 4096, - "top_p": 0.9, - "repeat_penalty": 1.0, - }, - }, - }, - ) - - print(response) - - -def main( - host: str, - port: int, - model: str, - use_https: Optional[bool] = False, - cert_path: Optional[str] = None, -): - asyncio.run( - run_main( - host, - port, - model, - use_https, - cert_path, - ) - ) - - -if __name__ == "__main__": - fire.Fire(main) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index ae7eb23a..9ac3c889 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -98,14 +98,6 @@ def run_benchmark( scoring_functions = benchmark.scoring_functions dataset_id = benchmark.dataset_id - print("scoring_functions", scoring_functions) - print("dataset_id", dataset_id) - print("model_id", model_id) - print("temperature", temperature) - print("max_tokens", max_tokens) - print("top_p", top_p) - print("repeat_penalty", repeat_penalty) - rows = client.datasetio.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples, From 31296fb9be0c96222b88c8bf413ef8f1855b0376 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Thu, 20 Feb 2025 16:22:40 -0800 Subject: [PATCH 4/6] pre-commit --- .../lib/cli/eval/run_benchmark.py | 16 ++++----------- src/llama_stack_client/lib/cli/eval/utils.py | 20 +++++-------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index 9ac3c889..4de1d340 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -148,21 +148,13 @@ def run_benchmark( for aggregation_function in aggregation_functions: scoring_results = output_res[scoring_fn] if aggregation_function == "categorical_count": - output_res[scoring_fn].append( - aggregate_categorical_count(scoring_results) - ) + output_res[scoring_fn].append(aggregate_categorical_count(scoring_results)) elif aggregation_function == "average": - output_res[scoring_fn].append( - aggregate_average(scoring_results) - ) + output_res[scoring_fn].append(aggregate_average(scoring_results)) elif aggregation_function == "median": - output_res[scoring_fn].append( - aggregate_median(scoring_results) - ) + output_res[scoring_fn].append(aggregate_median(scoring_results)) elif aggregation_function == "accuracy": - output_res[scoring_fn].append( 
- aggregate_accuracy(scoring_results) - ) + output_res[scoring_fn].append(aggregate_accuracy(scoring_results)) else: raise NotImplementedError( f"Aggregation function {aggregation_function} is not supported yet" diff --git a/src/llama_stack_client/lib/cli/eval/utils.py b/src/llama_stack_client/lib/cli/eval/utils.py index ec2e3efa..102b8817 100644 --- a/src/llama_stack_client/lib/cli/eval/utils.py +++ b/src/llama_stack_client/lib/cli/eval/utils.py @@ -8,9 +8,7 @@ def aggregate_categorical_count( - scoring_results: List[ - Dict[str, Union[bool, float, str, List[object], object, None]] - ], + scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]], ) -> Dict[str, Any]: scores = [str(r["score"]) for r in scoring_results] unique_scores = sorted(list(set(scores))) @@ -18,22 +16,16 @@ def aggregate_categorical_count( def aggregate_average( - scoring_results: List[ - Dict[str, Union[bool, float, str, List[object], object, None]] - ], + scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]], ) -> Dict[str, Any]: return { - "average": sum( - result["score"] for result in scoring_results if result["score"] is not None - ) + "average": sum(result["score"] for result in scoring_results if result["score"] is not None) / len([_ for _ in scoring_results if _["score"] is not None]), } def aggregate_median( - scoring_results: List[ - Dict[str, Union[bool, float, str, List[object], object, None]] - ], + scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]], ) -> Dict[str, Any]: scores = [r["score"] for r in scoring_results if r["score"] is not None] median = statistics.median(scores) if scores else None @@ -41,9 +33,7 @@ def aggregate_median( def aggregate_accuracy( - scoring_results: List[ - Dict[str, Union[bool, float, str, List[object], object, None]] - ], + scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]], ) -> Dict[str, Any]: num_correct = sum(result["score"] for result in scoring_results) avg_score = num_correct / len(scoring_results) From 9324dfe08023fcc3e556aa0a9ac33450ca48b66f Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Thu, 20 Feb 2025 17:12:31 -0800 Subject: [PATCH 5/6] address comment --- .../lib/cli/eval/run_benchmark.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index 4de1d340..f6eab9c8 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -24,19 +24,19 @@ @click.command("run-benchmark") @click.argument("benchmark_ids", nargs=-1, required=True) @click.option( - "--model_id", + "--model-id", required=True, help="model id to run the benchmark eval on", default=None, type=str, ) @click.option( - "--output_dir", + "--output-dir", required=True, help="Path to the dump eval results output directory", ) @click.option( - "--num_examples", + "--num-examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None, @@ -50,23 +50,23 @@ type=float, ) @click.option( - "--max_tokens", + "--max-tokens", required=False, - help="max_tokens in the sampling params to run generation", + help="max-tokens in the sampling params to run generation", default=4096, type=int, ) @click.option( - "--top_p", + "--top-p", required=False, - help="top_p in the sampling params to run generation", + help="top-p in the sampling params to run 
generation", default=0.9, type=float, ) @click.option( - "--repeat_penalty", + "--repeat-penalty", required=False, - help="repeat_penalty in the sampling params to run generation", + help="repeat-penalty in the sampling params to run generation", default=1.0, type=float, ) From d708ef52d3734079448ec357c21d83c055e9e516 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Thu, 20 Feb 2025 17:13:28 -0800 Subject: [PATCH 6/6] address comment --- src/llama_stack_client/lib/cli/eval/run_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index f6eab9c8..933b1338 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -22,7 +22,7 @@ @click.command("run-benchmark") -@click.argument("benchmark_ids", nargs=-1, required=True) +@click.argument("benchmark-ids", nargs=-1, required=True) @click.option( "--model-id", required=True,