From 1a7eddfd91df17d56782e47a13017e92636a0bee Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 14 Nov 2024 11:23:46 -0500 Subject: [PATCH 1/6] wip --- .../lib/cli/eval/__init__.py | 9 +++ src/llama_stack_client/lib/cli/eval/eval.py | 20 +++++ .../lib/cli/eval/run_benchmark.py | 74 +++++++++++++++++++ .../lib/cli/llama_stack_client.py | 12 ++- src/llama_stack_client/lib/cli/subcommand.py | 19 ----- 5 files changed, 114 insertions(+), 20 deletions(-) create mode 100644 src/llama_stack_client/lib/cli/eval/__init__.py create mode 100644 src/llama_stack_client/lib/cli/eval/eval.py create mode 100644 src/llama_stack_client/lib/cli/eval/run_benchmark.py delete mode 100644 src/llama_stack_client/lib/cli/subcommand.py diff --git a/src/llama_stack_client/lib/cli/eval/__init__.py b/src/llama_stack_client/lib/cli/eval/__init__.py new file mode 100644 index 00000000..503994e9 --- /dev/null +++ b/src/llama_stack_client/lib/cli/eval/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .eval import eval + +__all__ = ["eval"] diff --git a/src/llama_stack_client/lib/cli/eval/eval.py b/src/llama_stack_client/lib/cli/eval/eval.py new file mode 100644 index 00000000..6ff9aa2b --- /dev/null +++ b/src/llama_stack_client/lib/cli/eval/eval.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + + +import click + +from .run_benchmark import run_benchmark + + +@click.group() +def eval(): + """Run evaluation tasks""" + pass + + +# Register subcommands +eval.add_command(run_benchmark) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py new file mode 100644 index 00000000..bbee84bf --- /dev/null +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -0,0 +1,74 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import Optional + +import click + +# from rich.console import Console +# from rich.table import Table +from tqdm.rich import tqdm + + +@click.command("run_benchmark") +@click.option("--eval-task-id", required=True, help="ID of the eval task") +@click.option( + "--num-examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None +) +@click.pass_context +def run_benchmark(ctx, eval_task_id: str, num_examples: Optional[int]): + """Run a evaluation benchmark""" + + client = ctx.obj["client"] + + eval_task = client.eval_tasks.retrieve(name=eval_task_id) + scoring_functions = eval_task.scoring_functions + dataset_id = eval_task.dataset_id + + rows = client.datasetio.get_rows_paginated( + dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples + ) + + for row in rows: + print(row) + + output_res = { + "chat_completion_input": [], + "generated_output": [], + "expected_output": [], + } + for x in scoring_functions: + output_res[x] = [] + + for r in tqdm(rows.rows): + eval_res = client.eval.evaluate_rows( + task_id=eval_task_id, + input_rows=[r], + scoring_functions=scoring_functions, + task_config={ + "type": "benchmark", + "eval_candidate": { + "type": "model", + "model": "Llama3.2-3B-Instruct", + "sampling_params": { + "strategy": "greedy", + "temperature": 0, + "top_p": 0.95, + "top_k": 0, + "max_tokens": 0, + "repetition_penalty": 1.0, + }, + }, + }, + ) + # if eval_tasks_list_response: + # table = Table() + # for header in headers: + # table.add_column(header) + + # for item in eval_tasks_list_response: + # table.add_row(*[str(getattr(item, header)) for header in headers]) + # console.print(table) diff --git a/src/llama_stack_client/lib/cli/llama_stack_client.py b/src/llama_stack_client/lib/cli/llama_stack_client.py index a624aaee..f6ef91bd 100644 --- a/src/llama_stack_client/lib/cli/llama_stack_client.py +++ b/src/llama_stack_client/lib/cli/llama_stack_client.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import os + import click import yaml @@ -12,6 +14,7 @@ from .constants import get_config_file_path from .datasets import datasets +from .eval import eval from .eval_tasks import eval_tasks from .memory_banks import memory_banks from .models import models @@ -50,7 +53,13 @@ def cli(ctx, endpoint: str, config: str | None): if endpoint == "": endpoint = "http://localhost:5000" - client = LlamaStackClient(base_url=endpoint) + client = LlamaStackClient( + base_url=endpoint, + provider_data={ + "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""), + "togethers_api_key": os.environ.get("TOGETHERS_API_KEY", ""), + }, + ) ctx.obj = {"client": client} @@ -63,6 +72,7 @@ def cli(ctx, endpoint: str, config: str | None): cli.add_command(datasets, "datasets") cli.add_command(configure, "configure") cli.add_command(scoring_functions, "scoring_functions") +cli.add_command(eval, "eval") def main(): diff --git a/src/llama_stack_client/lib/cli/subcommand.py b/src/llama_stack_client/lib/cli/subcommand.py deleted file mode 100644 index b97637ec..00000000 --- a/src/llama_stack_client/lib/cli/subcommand.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- - -class Subcommand: - """All llama cli subcommands must inherit from this class""" - - def __init__(self, *args, **kwargs): - pass - - @classmethod - def create(cls, *args, **kwargs): - return cls(*args, **kwargs) - - def _add_arguments(self): - pass From c638d4060e0e6a0d866a64a30131eab8571cd825 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 14 Nov 2024 12:11:26 -0500 Subject: [PATCH 2/6] run benchmark cli --- .../lib/cli/eval/run_benchmark.py | 74 ++++++++++--------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index bbee84bf..1fe0fe57 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -4,22 +4,31 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import json +import os from typing import Optional import click - -# from rich.console import Console -# from rich.table import Table from tqdm.rich import tqdm @click.command("run_benchmark") @click.option("--eval-task-id", required=True, help="ID of the eval task") +@click.option( + "--eval-task-config", + required=True, + help="Path to the eval task config file in JSON format", +) +@click.option( + "--output-dir", + required=True, + help="Path to the dump eval results output directory", +) @click.option( "--num-examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None ) @click.pass_context -def run_benchmark(ctx, eval_task_id: str, num_examples: Optional[int]): +def run_benchmark(ctx, eval_task_id: str, eval_task_config: str, output_dir: str, num_examples: Optional[int]): """Run a evaluation benchmark""" client = ctx.obj["client"] @@ -32,43 +41,38 @@ def run_benchmark(ctx, eval_task_id: str, num_examples: Optional[int]): dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples ) - for row in rows: - print(row) + with open(eval_task_config, "r") as f: + eval_task_config = json.load(f) - output_res = { - "chat_completion_input": [], - "generated_output": [], - "expected_output": [], - } - for x in scoring_functions: - output_res[x] = [] + output_res = {} for r in tqdm(rows.rows): eval_res = client.eval.evaluate_rows( task_id=eval_task_id, input_rows=[r], scoring_functions=scoring_functions, - task_config={ - "type": "benchmark", - "eval_candidate": { - "type": "model", - "model": "Llama3.2-3B-Instruct", - "sampling_params": { - "strategy": "greedy", - "temperature": 0, - "top_p": 0.95, - "top_k": 0, - "max_tokens": 0, - "repetition_penalty": 1.0, - }, - }, - }, + task_config=eval_task_config, ) - # if eval_tasks_list_response: - # table = Table() - # for header in headers: - # table.add_column(header) + for k in r.keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(r[k]) + + for k in eval_res.generations[0].keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(eval_res.generations[0][k]) + + for scoring_fn in scoring_functions: + if scoring_fn not in output_res: + output_res[scoring_fn] = [] + output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + # Save results to JSON file + output_file = os.path.join(output_dir, f"{eval_task_id}_results.json") + with open(output_file, "w") as f: + json.dump(output_res, f, indent=2) - # for item in 
eval_tasks_list_response:
-    #         table.add_row(*[str(getattr(item, header)) for header in headers])
-    #     console.print(table)
+    print(f"Results saved to: {output_file}")

From 3b352053e5bc94e87c3b671de6c1b9c39e758985 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Thu, 14 Nov 2024 12:16:58 -0500
Subject: [PATCH 3/6] precommit

---
 src/llama_stack_client/lib/cli/eval_tasks/__init__.py   | 2 ++
 src/llama_stack_client/lib/cli/memory_banks/__init__.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/llama_stack_client/lib/cli/eval_tasks/__init__.py b/src/llama_stack_client/lib/cli/eval_tasks/__init__.py
index 010ffb76..d755c85a 100644
--- a/src/llama_stack_client/lib/cli/eval_tasks/__init__.py
+++ b/src/llama_stack_client/lib/cli/eval_tasks/__init__.py
@@ -5,3 +5,5 @@
 # the root directory of this source tree.
 
 from .eval_tasks import eval_tasks
+
+__all__ = ["eval_tasks"]
diff --git a/src/llama_stack_client/lib/cli/memory_banks/__init__.py b/src/llama_stack_client/lib/cli/memory_banks/__init__.py
index eb25ec3d..eb9a0bc4 100644
--- a/src/llama_stack_client/lib/cli/memory_banks/__init__.py
+++ b/src/llama_stack_client/lib/cli/memory_banks/__init__.py
@@ -5,3 +5,5 @@
 # the root directory of this source tree.
 
 from .memory_banks import memory_banks
+
+__all__ = ["memory_banks"]

From 35cdfb389edd9e7796d7c75e0b07bcb6d9b134b1 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Thu, 14 Nov 2024 12:19:28 -0500
Subject: [PATCH 4/6] doc

---
 docs/cli_reference.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/docs/cli_reference.md b/docs/cli_reference.md
index 1687f7a0..84b7b62d 100644
--- a/docs/cli_reference.md
+++ b/docs/cli_reference.md
@@ -124,3 +124,28 @@ $ llama-stack-client shields list
 | llama_guard  | {}       | meta-reference | llama_guard |
 +--------------+----------+----------------+-------------+
 ```
+
+#### `llama-stack-client eval run_benchmark`
+```bash
+$ llama-stack-client eval run_benchmark --eval-task-id meta-reference-mmlu --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json
+```
+
+where `eval_task_config.json` is the path to the eval task config file in JSON format.
An example eval_task_config +``` +$ cat ~/eval_task_config.json +{ + "type": "benchmark", + "eval_candidate": { + "type": "model", + "model": "Llama3.1-405B-Instruct", + "sampling_params": { + "strategy": "greedy", + "temperature": 0, + "top_p": 0.95, + "top_k": 0, + "max_tokens": 0, + "repetition_penalty": 1.0 + } + } +} +``` From 91f0a670598a98cdb39c612606978ec500f3b657 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 14 Nov 2024 12:47:43 -0500 Subject: [PATCH 5/6] list of eval tasks --- .../lib/cli/eval/run_benchmark.py | 78 ++++++++++--------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index 1fe0fe57..e197c778 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -13,11 +13,12 @@ @click.command("run_benchmark") -@click.option("--eval-task-id", required=True, help="ID of the eval task") +@click.argument("eval-task-ids", nargs=-1, required=True) # Changed from option to argument, supports multiple IDs @click.option( "--eval-task-config", required=True, help="Path to the eval task config file in JSON format", + type=click.Path(exists=True), ) @click.option( "--output-dir", @@ -28,51 +29,54 @@ "--num-examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None ) @click.pass_context -def run_benchmark(ctx, eval_task_id: str, eval_task_config: str, output_dir: str, num_examples: Optional[int]): +def run_benchmark( + ctx, eval_task_ids: tuple[str, ...], eval_task_config: str, output_dir: str, num_examples: Optional[int] +): """Run a evaluation benchmark""" client = ctx.obj["client"] - eval_task = client.eval_tasks.retrieve(name=eval_task_id) - scoring_functions = eval_task.scoring_functions - dataset_id = eval_task.dataset_id + for eval_task_id in eval_task_ids: + eval_task = client.eval_tasks.retrieve(name=eval_task_id) + scoring_functions = eval_task.scoring_functions + dataset_id = eval_task.dataset_id - rows = client.datasetio.get_rows_paginated( - dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples - ) + rows = client.datasetio.get_rows_paginated( + dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples + ) - with open(eval_task_config, "r") as f: - eval_task_config = json.load(f) + with open(eval_task_config, "r") as f: + eval_task_config = json.load(f) - output_res = {} + output_res = {} - for r in tqdm(rows.rows): - eval_res = client.eval.evaluate_rows( - task_id=eval_task_id, - input_rows=[r], - scoring_functions=scoring_functions, - task_config=eval_task_config, - ) - for k in r.keys(): - if k not in output_res: - output_res[k] = [] - output_res[k].append(r[k]) + for r in tqdm(rows.rows): + eval_res = client.eval.evaluate_rows( + task_id=eval_task_id, + input_rows=[r], + scoring_functions=scoring_functions, + task_config=eval_task_config, + ) + for k in r.keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(r[k]) - for k in eval_res.generations[0].keys(): - if k not in output_res: - output_res[k] = [] - output_res[k].append(eval_res.generations[0][k]) + for k in eval_res.generations[0].keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(eval_res.generations[0][k]) - for scoring_fn in scoring_functions: - if scoring_fn not in output_res: - output_res[scoring_fn] = [] - 
output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
+            for scoring_fn in scoring_functions:
+                if scoring_fn not in output_res:
+                    output_res[scoring_fn] = []
+                output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
 
-    # Create output directory if it doesn't exist
-    os.makedirs(output_dir, exist_ok=True)
-    # Save results to JSON file
-    output_file = os.path.join(output_dir, f"{eval_task_id}_results.json")
-    with open(output_file, "w") as f:
-        json.dump(output_res, f, indent=2)
+        # Create output directory if it doesn't exist
+        os.makedirs(output_dir, exist_ok=True)
+        # Save results to JSON file
+        output_file = os.path.join(output_dir, f"{eval_task_id}_results.json")
+        with open(output_file, "w") as f:
+            json.dump(output_res, f, indent=2)
 
-    print(f"Results saved to: {output_file}")
+        print(f"Results saved to: {output_file}")

From 5bbb609366de0fc90359371004d5307328f5eb1e Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Thu, 14 Nov 2024 13:26:28 -0500
Subject: [PATCH 6/6] update doc

---
 docs/cli_reference.md                                | 2 +-
 src/llama_stack_client/lib/cli/eval/run_benchmark.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/cli_reference.md b/docs/cli_reference.md
index 84b7b62d..b112416f 100644
--- a/docs/cli_reference.md
+++ b/docs/cli_reference.md
@@ -127,7 +127,7 @@ $ llama-stack-client shields list
 
 #### `llama-stack-client eval run_benchmark`
 ```bash
-$ llama-stack-client eval run_benchmark --eval-task-id meta-reference-mmlu --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json
+$ llama-stack-client eval run_benchmark meta-reference-mmlu --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json
 ```
 
 where `eval_task_config.json` is the path to the eval task config file in JSON format. An example eval_task_config
diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py
index e197c778..24f1c791 100644
--- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py
+++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py
@@ -13,7 +13,7 @@
 
 
 @click.command("run_benchmark")
-@click.argument("eval-task-ids", nargs=-1, required=True)  # Changed from option to argument, supports multiple IDs
+@click.argument("eval-task-ids", nargs=-1, required=True)
 @click.option(
     "--eval-task-config",
     required=True,
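
Illustrative note (not part of the patches above): a minimal sketch of how the per-task results file written by `run_benchmark` could be inspected after a run. The file layout mirrors the `output_res` dict built in `run_benchmark.py` (one list per column: dataset fields, generations, and one entry per scoring function). The task id `meta-reference-mmlu`, the results path, and the assumption that scoring rows carry a numeric `score` field are examples only, not guaranteed by the patches.

```python
import json

# Hypothetical path: run_benchmark writes <output-dir>/<eval_task_id>_results.json.
results_path = "meta-reference-mmlu_results.json"

with open(results_path) as f:
    results = json.load(f)

# Every column is a per-row list, so any column gives the row count.
num_rows = len(next(iter(results.values())))
print(f"{num_rows} rows, columns: {sorted(results.keys())}")

# Assumption: scoring-function columns hold the per-row dicts taken from
# eval_res.scores[<fn>].score_rows[0]; average any numeric "score" field found.
for column, values in results.items():
    scores = [v["score"] for v in values if isinstance(v, dict) and isinstance(v.get("score"), (int, float))]
    if scores:
        print(f"{column}: mean score = {sum(scores) / len(scores):.3f}")
```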