Apply pre-commit formatting fixes

SLR722 · SLR722 · commit 31296fb9be0c · 2025-02-20T16:22:40.000-08:00
diff --git a/src/llama_stack_client/lib/cli/eval/run_benchmark.py b/src/llama_stack_client/lib/cli/eval/run_benchmark.py
@@ -148,21 +148,13 @@ def run_benchmark(
                     for aggregation_function in aggregation_functions:
                         scoring_results = output_res[scoring_fn]
                         if aggregation_function == "categorical_count":
-                            output_res[scoring_fn].append(
-                                aggregate_categorical_count(scoring_results)
-                            )
+                            output_res[scoring_fn].append(aggregate_categorical_count(scoring_results))
                         elif aggregation_function == "average":
-                            output_res[scoring_fn].append(
-                                aggregate_average(scoring_results)
-                            )
+                            output_res[scoring_fn].append(aggregate_average(scoring_results))
                         elif aggregation_function == "median":
-                            output_res[scoring_fn].append(
-                                aggregate_median(scoring_results)
-                            )
+                            output_res[scoring_fn].append(aggregate_median(scoring_results))
                         elif aggregation_function == "accuracy":
-                            output_res[scoring_fn].append(
-                                aggregate_accuracy(scoring_results)
-                            )
+                            output_res[scoring_fn].append(aggregate_accuracy(scoring_results))
                         else:
                             raise NotImplementedError(
                                 f"Aggregation function {aggregation_function} is not supported yet"
diff --git a/src/llama_stack_client/lib/cli/eval/utils.py b/src/llama_stack_client/lib/cli/eval/utils.py
@@ -8,42 +8,32 @@
 
 
 def aggregate_categorical_count(
-    scoring_results: List[
-        Dict[str, Union[bool, float, str, List[object], object, None]]
-    ],
+    scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]],
 ) -> Dict[str, Any]:
     scores = [str(r["score"]) for r in scoring_results]
     unique_scores = sorted(list(set(scores)))
     return {"categorical_count": {s: scores.count(s) for s in unique_scores}}
 
 
 def aggregate_average(
-    scoring_results: List[
-        Dict[str, Union[bool, float, str, List[object], object, None]]
-    ],
+    scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]],
 ) -> Dict[str, Any]:
     return {
-        "average": sum(
-            result["score"] for result in scoring_results if result["score"] is not None
-        )
+        "average": sum(result["score"] for result in scoring_results if result["score"] is not None)
         / len([_ for _ in scoring_results if _["score"] is not None]),
     }
 
 
 def aggregate_median(
-    scoring_results: List[
-        Dict[str, Union[bool, float, str, List[object], object, None]]
-    ],
+    scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]],
 ) -> Dict[str, Any]:
     scores = [r["score"] for r in scoring_results if r["score"] is not None]
     median = statistics.median(scores) if scores else None
     return {"median": median}
 
 
 def aggregate_accuracy(
-    scoring_results: List[
-        Dict[str, Union[bool, float, str, List[object], object, None]]
-    ],
+    scoring_results: List[Dict[str, Union[bool, float, str, List[object], object, None]]],
 ) -> Dict[str, Any]:
     num_correct = sum(result["score"] for result in scoring_results)
     avg_score = num_correct / len(scoring_results)