From 1a7eddfd91df17d56782e47a13017e92636a0bee Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 14 Nov 2024 11:23:46 -0500 Subject: [PATCH 1/6] wip --- .../lib/cli/eval/__init__.py | 9 +++ src/llama_stack_client/lib/cli/eval/eval.py | 20 +++++ .../lib/cli/eval/run_benchmark.py | 74 +++++++++++++++++++ .../lib/cli/llama_stack_client.py | 12 ++- src/llama_stack_client/lib/cli/subcommand.py | 19 ----- 5 files changed, 114 insertions(+), 20 deletions(-) create mode 100644 src/llama_stack_client/lib/cli/eval/__init__.py create mode 100644 src/llama_stack_client/lib/cli/eval/eval.py create mode 100644 src/llama_stack_client/lib/cli/eval/run_benchmark.py delete mode 100644 src/llama_stack_client/lib/cli/subcommand.py diff --git a/src/llama_stack_client/lib/cli/eval/__init__.py b/src/llama_stack_client/lib/cli/eval/__init__.py new file mode 100644 index 00000000..503994e9 --- /dev/null +++ b/src/llama_stack_client/lib/cli/eval/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .eval import eval + +__all__ = ["eval"] diff --git a/src/llama_stack_client/lib/cli/eval/eval.py b/src/llama_stack_client/lib/cli/eval/eval.py new file mode 100644 index 00000000..6ff9aa2b --- /dev/null +++ b/src/llama_stack_client/lib/cli/eval/eval.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + + +import click + +from .run_benchmark import run_benchmark + + +@click.group() +def eval(): + """Run evaluation tasks""" + pass + + +# Register subcommands +eval.add_command(run_benchmark) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py new file mode 100644 index 00000000..bbee84bf --- /dev/null +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -0,0 +1,74 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import Optional + +import click + +# from rich.console import Console +# from rich.table import Table +from tqdm.rich import tqdm + + +@click.command("run_benchmark") +@click.option("--eval-task-id", required=True, help="ID of the eval task") +@click.option( + "--num-examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None +) +@click.pass_context +def run_benchmark(ctx, eval_task_id: str, num_examples: Optional[int]): + """Run a evaluation benchmark""" + + client = ctx.obj["client"] + + eval_task = client.eval_tasks.retrieve(name=eval_task_id) + scoring_functions = eval_task.scoring_functions + dataset_id = eval_task.dataset_id + + rows = client.datasetio.get_rows_paginated( + dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples + ) + + for row in rows: + print(row) + + output_res = { + "chat_completion_input": [], + "generated_output": [], + "expected_output": [], + } + for x in scoring_functions: + output_res[x] = [] + + for r in tqdm(rows.rows): + eval_res = client.eval.evaluate_rows( + task_id=eval_task_id, + input_rows=[r], + scoring_functions=scoring_functions, + task_config={ + "type": "benchmark", + "eval_candidate": { + "type": "model", + "model": "Llama3.2-3B-Instruct", + "sampling_params": { + "strategy": "greedy", + "temperature": 0, + "top_p": 0.95, + "top_k": 0, + "max_tokens": 0, + "repetition_penalty": 1.0, + }, + }, + }, + ) + # if eval_tasks_list_response: + # table = Table() + # for header in headers: + # table.add_column(header) + + # for item in eval_tasks_list_response: + # table.add_row(*[str(getattr(item, header)) for header in headers]) + # console.print(table) diff --git a/src/llama_stack_client/lib/cli/llama_stack_client.py b/src/llama_stack_client/lib/cli/llama_stack_client.py index a624aaee..f6ef91bd 100644 --- a/src/llama_stack_client/lib/cli/llama_stack_client.py +++ b/src/llama_stack_client/lib/cli/llama_stack_client.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import os + import click import yaml @@ -12,6 +14,7 @@ from .constants import get_config_file_path from .datasets import datasets +from .eval import eval from .eval_tasks import eval_tasks from .memory_banks import memory_banks from .models import models @@ -50,7 +53,13 @@ def cli(ctx, endpoint: str, config: str | None): if endpoint == "": endpoint = "http://localhost:5000" - client = LlamaStackClient(base_url=endpoint) + client = LlamaStackClient( + base_url=endpoint, + provider_data={ + "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""), + "togethers_api_key": os.environ.get("TOGETHERS_API_KEY", ""), + }, + ) ctx.obj = {"client": client} @@ -63,6 +72,7 @@ def cli(ctx, endpoint: str, config: str | None): cli.add_command(datasets, "datasets") cli.add_command(configure, "configure") cli.add_command(scoring_functions, "scoring_functions") +cli.add_command(eval, "eval") def main(): diff --git a/src/llama_stack_client/lib/cli/subcommand.py b/src/llama_stack_client/lib/cli/subcommand.py deleted file mode 100644 index b97637ec..00000000 --- a/src/llama_stack_client/lib/cli/subcommand.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- - -class Subcommand: - """All llama cli subcommands must inherit from this class""" - - def __init__(self, *args, **kwargs): - pass - - @classmethod - def create(cls, *args, **kwargs): - return cls(*args, **kwargs) - - def _add_arguments(self): - pass From c638d4060e0e6a0d866a64a30131eab8571cd825 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 14 Nov 2024 12:11:26 -0500 Subject: [PATCH 2/6] run benchmark cli --- .../lib/cli/eval/run_benchmark.py | 74 ++++++++++--------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index bbee84bf..1fe0fe57 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -4,22 +4,31 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import json +import os from typing import Optional import click - -# from rich.console import Console -# from rich.table import Table from tqdm.rich import tqdm @click.command("run_benchmark") @click.option("--eval-task-id", required=True, help="ID of the eval task") +@click.option( + "--eval-task-config", + required=True, + help="Path to the eval task config file in JSON format", +) +@click.option( + "--output-dir", + required=True, + help="Path to the dump eval results output directory", +) @click.option( "--num-examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None ) @click.pass_context -def run_benchmark(ctx, eval_task_id: str, num_examples: Optional[int]): +def run_benchmark(ctx, eval_task_id: str, eval_task_config: str, output_dir: str, num_examples: Optional[int]): """Run a evaluation benchmark""" client = ctx.obj["client"] @@ -32,43 +41,38 @@ def run_benchmark(ctx, eval_task_id: str, num_examples: Optional[int]): dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples ) - for row in rows: - print(row) + with open(eval_task_config, "r") as f: + eval_task_config = json.load(f) - output_res = { - "chat_completion_input": [], - "generated_output": [], - "expected_output": [], - } - for x in scoring_functions: - output_res[x] = [] + output_res = {} for r in tqdm(rows.rows): eval_res = client.eval.evaluate_rows( task_id=eval_task_id, input_rows=[r], scoring_functions=scoring_functions, - task_config={ - "type": "benchmark", - "eval_candidate": { - "type": "model", - "model": "Llama3.2-3B-Instruct", - "sampling_params": { - "strategy": "greedy", - "temperature": 0, - "top_p": 0.95, - "top_k": 0, - "max_tokens": 0, - "repetition_penalty": 1.0, - }, - }, - }, + task_config=eval_task_config, ) - # if eval_tasks_list_response: - # table = Table() - # for header in headers: - # table.add_column(header) + for k in r.keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(r[k]) + + for k in eval_res.generations[0].keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(eval_res.generations[0][k]) + + for scoring_fn in scoring_functions: + if scoring_fn not in output_res: + output_res[scoring_fn] = [] + output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + # Save results to JSON file + output_file = os.path.join(output_dir, f"{eval_task_id}_results.json") + with open(output_file, "w") as f: + json.dump(output_res, f, indent=2) - # for item in 
eval_tasks_list_response:
-    #         table.add_row(*[str(getattr(item, header)) for header in headers])
-    #     console.print(table)
+    print(f"Results saved to: {output_file}")

From 3b352053e5bc94e87c3b671de6c1b9c39e758985 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Thu, 14 Nov 2024 12:16:58 -0500
Subject: [PATCH 3/6] precommit

---
 src/llama_stack_client/lib/cli/eval_tasks/__init__.py   | 2 ++
 src/llama_stack_client/lib/cli/memory_banks/__init__.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/llama_stack_client/lib/cli/eval_tasks/__init__.py b/src/llama_stack_client/lib/cli/eval_tasks/__init__.py
index 010ffb76..d755c85a 100644
--- a/src/llama_stack_client/lib/cli/eval_tasks/__init__.py
+++ b/src/llama_stack_client/lib/cli/eval_tasks/__init__.py
@@ -5,3 +5,5 @@
 # the root directory of this source tree.
 
 from .eval_tasks import eval_tasks
+
+__all__ = ["eval_tasks"]
diff --git a/src/llama_stack_client/lib/cli/memory_banks/__init__.py b/src/llama_stack_client/lib/cli/memory_banks/__init__.py
index eb25ec3d..eb9a0bc4 100644
--- a/src/llama_stack_client/lib/cli/memory_banks/__init__.py
+++ b/src/llama_stack_client/lib/cli/memory_banks/__init__.py
@@ -5,3 +5,5 @@
 # the root directory of this source tree.
 
 from .memory_banks import memory_banks
+
+__all__ = ["memory_banks"]

From 35cdfb389edd9e7796d7c75e0b07bcb6d9b134b1 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Thu, 14 Nov 2024 12:19:28 -0500
Subject: [PATCH 4/6] doc

---
 docs/cli_reference.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/docs/cli_reference.md b/docs/cli_reference.md
index 1687f7a0..84b7b62d 100644
--- a/docs/cli_reference.md
+++ b/docs/cli_reference.md
@@ -124,3 +124,28 @@ $ llama-stack-client shields list
 | llama_guard  | {}       | meta-reference | llama_guard |
 +--------------+----------+----------------+-------------+
 ```
+
+#### `llama-stack-client eval run_benchmark`
+```bash
+$ llama-stack-client eval run_benchmark --eval-task-id meta-reference-mmlu --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json
+```
+
+where `eval_task_config.json` is the path to the eval task config file in JSON format.
An example eval_task_config +``` +$ cat ~/eval_task_config.json +{ + "type": "benchmark", + "eval_candidate": { + "type": "model", + "model": "Llama3.1-405B-Instruct", + "sampling_params": { + "strategy": "greedy", + "temperature": 0, + "top_p": 0.95, + "top_k": 0, + "max_tokens": 0, + "repetition_penalty": 1.0 + } + } +} +``` From 91f0a670598a98cdb39c612606978ec500f3b657 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 14 Nov 2024 12:47:43 -0500 Subject: [PATCH 5/6] list of eval tasks --- .../lib/cli/eval/run_benchmark.py | 78 ++++++++++--------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py index 1fe0fe57..e197c778 100644 --- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py +++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py @@ -13,11 +13,12 @@ @click.command("run_benchmark") -@click.option("--eval-task-id", required=True, help="ID of the eval task") +@click.argument("eval-task-ids", nargs=-1, required=True) # Changed from option to argument, supports multiple IDs @click.option( "--eval-task-config", required=True, help="Path to the eval task config file in JSON format", + type=click.Path(exists=True), ) @click.option( "--output-dir", @@ -28,51 +29,54 @@ "--num-examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None ) @click.pass_context -def run_benchmark(ctx, eval_task_id: str, eval_task_config: str, output_dir: str, num_examples: Optional[int]): +def run_benchmark( + ctx, eval_task_ids: tuple[str, ...], eval_task_config: str, output_dir: str, num_examples: Optional[int] +): """Run a evaluation benchmark""" client = ctx.obj["client"] - eval_task = client.eval_tasks.retrieve(name=eval_task_id) - scoring_functions = eval_task.scoring_functions - dataset_id = eval_task.dataset_id + for eval_task_id in eval_task_ids: + eval_task = client.eval_tasks.retrieve(name=eval_task_id) + scoring_functions = eval_task.scoring_functions + dataset_id = eval_task.dataset_id - rows = client.datasetio.get_rows_paginated( - dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples - ) + rows = client.datasetio.get_rows_paginated( + dataset_id=dataset_id, rows_in_page=-1 if num_examples is None else num_examples + ) - with open(eval_task_config, "r") as f: - eval_task_config = json.load(f) + with open(eval_task_config, "r") as f: + eval_task_config = json.load(f) - output_res = {} + output_res = {} - for r in tqdm(rows.rows): - eval_res = client.eval.evaluate_rows( - task_id=eval_task_id, - input_rows=[r], - scoring_functions=scoring_functions, - task_config=eval_task_config, - ) - for k in r.keys(): - if k not in output_res: - output_res[k] = [] - output_res[k].append(r[k]) + for r in tqdm(rows.rows): + eval_res = client.eval.evaluate_rows( + task_id=eval_task_id, + input_rows=[r], + scoring_functions=scoring_functions, + task_config=eval_task_config, + ) + for k in r.keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(r[k]) - for k in eval_res.generations[0].keys(): - if k not in output_res: - output_res[k] = [] - output_res[k].append(eval_res.generations[0][k]) + for k in eval_res.generations[0].keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(eval_res.generations[0][k]) - for scoring_fn in scoring_functions: - if scoring_fn not in output_res: - output_res[scoring_fn] = [] - 
output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
+            for scoring_fn in scoring_functions:
+                if scoring_fn not in output_res:
+                    output_res[scoring_fn] = []
+                output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
 
-    # Create output directory if it doesn't exist
-    os.makedirs(output_dir, exist_ok=True)
-    # Save results to JSON file
-    output_file = os.path.join(output_dir, f"{eval_task_id}_results.json")
-    with open(output_file, "w") as f:
-        json.dump(output_res, f, indent=2)
+        # Create output directory if it doesn't exist
+        os.makedirs(output_dir, exist_ok=True)
+        # Save results to JSON file
+        output_file = os.path.join(output_dir, f"{eval_task_id}_results.json")
+        with open(output_file, "w") as f:
+            json.dump(output_res, f, indent=2)
 
-    print(f"Results saved to: {output_file}")
+        print(f"Results saved to: {output_file}")

From 5bbb609366de0fc90359371004d5307328f5eb1e Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Thu, 14 Nov 2024 13:26:28 -0500
Subject: [PATCH 6/6] update doc

---
 docs/cli_reference.md                                | 2 +-
 src/llama_stack_client/lib/cli/eval/run_benchmark.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/cli_reference.md b/docs/cli_reference.md
index 84b7b62d..b112416f 100644
--- a/docs/cli_reference.md
+++ b/docs/cli_reference.md
@@ -127,7 +127,7 @@ $ llama-stack-client shields list
 
 #### `llama-stack-client eval run_benchmark`
 ```bash
-$ llama-stack-client eval run_benchmark --eval-task-id meta-reference-mmlu --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json
+$ llama-stack-client eval run_benchmark meta-reference-mmlu --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json
 ```
 
 where `eval_task_config.json` is the path to the eval task config file in JSON format. An example eval_task_config
diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py
index e197c778..24f1c791 100644
--- a/src/llama_stack_client/lib/cli/eval/run_benchmark.py
+++ b/src/llama_stack_client/lib/cli/eval/run_benchmark.py
@@ -13,7 +13,7 @@
 
 
 @click.command("run_benchmark")
-@click.argument("eval-task-ids", nargs=-1, required=True)  # Changed from option to argument, supports multiple IDs
+@click.argument("eval-task-ids", nargs=-1, required=True)
 @click.option(
     "--eval-task-config",
     required=True,
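
Illustrative note (not part of the patches above): a minimal sketch of how the per-task results file written by `run_benchmark` could be inspected after a run. The file layout mirrors the `output_res` dict built in `run_benchmark.py` (one list per column: dataset fields, generations, and one entry per scoring function). The task id `meta-reference-mmlu`, the results path, and the assumption that scoring rows carry a numeric `score` field are examples only, not guaranteed by the patches.

```python
import json

# Hypothetical path: run_benchmark writes <output-dir>/<eval_task_id>_results.json.
results_path = "meta-reference-mmlu_results.json"

with open(results_path) as f:
    results = json.load(f)

# Every column is a per-row list, so any column gives the row count.
num_rows = len(next(iter(results.values())))
print(f"{num_rows} rows, columns: {sorted(results.keys())}")

# Assumption: scoring-function columns hold the per-row dicts taken from
# eval_res.scores[<fn>].score_rows[0]; average any numeric "score" field found.
for column, values in results.items():
    scores = [v["score"] for v in values if isinstance(v, dict) and isinstance(v.get("score"), (int, float))]
    if scores:
        print(f"{column}: mean score = {sum(scores) / len(scores):.3f}")
```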