|
9 | 9 | from typing import Optional |
10 | 10 |
|
11 | 11 | import click |
| 12 | +from rich import print as rprint |
12 | 13 | from tqdm.rich import tqdm |
13 | 14 |
|
| 15 | +from ..common.utils import create_bar_chart |
| 16 | + |
14 | 17 |
|
15 | 18 | @click.command("run_benchmark") |
16 | 19 | @click.argument("eval-task-ids", nargs=-1, required=True) |
|
28 | 31 | @click.option( |
29 | 32 | "--num-examples", required=False, help="Number of examples to evaluate on, useful for debugging", default=None |
30 | 33 | ) |
| 34 | +@click.option( |
| 35 | + "--visualize", |
| 36 | + is_flag=True, |
| 37 | + default=False, |
| 38 | + help="Visualize evaluation results after completion", |
| 39 | +) |
31 | 40 | @click.pass_context |
32 | 41 | def run_benchmark( |
33 | | - ctx, eval_task_ids: tuple[str, ...], eval_task_config: str, output_dir: str, num_examples: Optional[int] |
| 42 | + ctx, |
| 43 | + eval_task_ids: tuple[str, ...], |
| 44 | + eval_task_config: str, |
| 45 | + output_dir: str, |
| 46 | + num_examples: Optional[int], |
| 47 | + visualize: bool, |
34 | 48 | ): |
 35 | 49 | """Run an evaluation benchmark"""
36 | 50 |
|
@@ -79,4 +93,13 @@ def run_benchmark( |
79 | 93 | with open(output_file, "w") as f: |
80 | 94 | json.dump(output_res, f, indent=2) |
81 | 95 |
|
82 | | - print(f"Results saved to: {output_file}") |
| 96 | + rprint(f"[green]✓[/green] Results saved to: [blue]{output_file}[/blue]!\n") |
| 97 | + |
| 98 | + if visualize: |
| 99 | + for scoring_fn in scoring_functions: |
| 100 | + res = output_res[scoring_fn] |
| 101 | + assert len(res) > 0 and "score" in res[0] |
| 102 | + scores = [str(r["score"]) for r in res] |
| 103 | + unique_scores = sorted(list(set(scores))) |
| 104 | + counts = [scores.count(s) for s in unique_scores] |
| 105 | + create_bar_chart(counts, unique_scores, title=f"{scoring_fn}") |
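
The new `--visualize` path depends on `create_bar_chart` from `..common.utils`, which is imported above but not shown in this diff. Below is a minimal sketch of what such a helper could look like, assuming it renders a per-score count table with `rich` (already used here via `rprint`); the actual helper may be implemented differently.

```python
# Hypothetical sketch of the create_bar_chart helper imported from ..common.utils.
# The real implementation is not part of this diff; this version assumes a
# rich-based console and simple Unicode block bars.
from rich.console import Console
from rich.table import Table


def create_bar_chart(values: list[int], labels: list[str], title: str = "") -> None:
    """Print a horizontal bar chart of `values` keyed by `labels`."""
    console = Console()
    table = Table(title=title)
    table.add_column("Score")
    table.add_column("Count", justify="right")
    table.add_column("Distribution")

    max_value = max(values) if values else 1
    for label, value in zip(labels, values):
        # Scale each bar to at most 40 character cells; non-zero counts get at least one.
        bar = "█" * max(1, int(40 * value / max_value)) if value else ""
        table.add_row(label, str(value), bar)

    console.print(table)
```

With a helper along these lines, invoking the command with `--visualize` prints one chart per scoring function showing how many evaluated examples received each score.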