
Commit aebe5be: "update tutorials"
Parent: a334229

16 files changed: +916, -792 lines

README.md

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ to install the necessary dependencies. You might need to install [pipx](https://
 
 ## Usage
 
-To get started right away, take a look at our [getting started notebook](https://github.com/finitearth/promptolution/blob/main/notebooks/getting_started.ipynb).
+To get started right away, take a look at our [getting started notebook](https://github.com/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb) and our [other demos and tutorials](https://github.com/finitearth/promptolution/blob/main/tutorials).
 For more details, a comprehensive **documentation** with API reference is availabe at https://finitearth.github.io/promptolution/.
 
 ### Featured Optimizers

notebooks/llm_as_judge_tutorial.ipynb

Lines changed: 0 additions & 467 deletions
This file was deleted.

notebooks/reward_task_tutorial.ipynb

Lines changed: 0 additions & 297 deletions
This file was deleted.

promptolution/tasks/base_task.py

Lines changed: 4 additions & 5 deletions
@@ -147,8 +147,8 @@ def _collect_results_from_cache(
         return scores if not return_seq else (scores, seqs)
 
     @abstractmethod
-    def _single_evaluate(self, x: np.ndarray, y: np.ndarray, pred: np.ndarray) -> float:
-        """Abstract method to calculate the score for a single prediction.
+    def _evaluate(self, xs: np.ndarray, ys: np.ndarray, preds: np.ndarray) -> List[float]:
+        """Abstract method to calculate the score for a predictions.
 
         This method should be implemented by subclasses based on their specific evaluation logic.
         """
@@ -183,10 +183,9 @@ def evaluate(
 
         if return_seq:
             preds, seqs = preds
-
+        scores = self._evaluate(xs_to_evaluate, ys_to_evaluate, preds)
         for i, cache_key in enumerate(batches):
-            x, y, y_pred = xs_to_evaluate[i], ys_to_evaluate[i], preds[i]
-            self.eval_cache[cache_key] = self._single_evaluate(x, y, y_pred)
+            self.eval_cache[cache_key] = scores[i]
 
             if return_seq:
                 self.seq_cache[cache_key] = seqs[i]
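The change above replaces the per-example `_single_evaluate` hook with a batched abstract `_evaluate` that returns one score per prediction; `evaluate` then caches `scores[i]` under each batch key. As a rough illustration of the new contract, a custom task subclass might look like the sketch below (the `RegressionTask` name and the negative-absolute-error scoring are invented for this example, and any other overrides `BaseTask` may require are omitted):

```python
from typing import List

import numpy as np

from promptolution.tasks.base_task import BaseTask


class RegressionTask(BaseTask):
    """Hypothetical task that scores numeric predictions (illustration only)."""

    def _evaluate(self, xs: np.ndarray, ys: np.ndarray, preds: np.ndarray) -> List[float]:
        # Return one score per prediction, in the same order as xs/ys,
        # so BaseTask.evaluate can store scores[i] in eval_cache per batch key.
        return [-abs(float(y) - float(pred)) for y, pred in zip(ys, preds)]
```

Because scores are now computed in a single call over the batch, a subclass could also vectorize the whole computation (e.g., with numpy) instead of looping item by item.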

promptolution/tasks/classification_tasks.py

Lines changed: 6 additions & 3 deletions
@@ -5,7 +5,7 @@
 import pandas as pd
 from sklearn.metrics import accuracy_score
 
-from typing import TYPE_CHECKING, Callable, Literal
+from typing import TYPE_CHECKING, Callable, List, Literal
 
 from promptolution.tasks.base_task import BaseTask
 
@@ -65,6 +65,9 @@ def __init__(
         self.ys = df[self.y_column].str.lower().values # Ensure y values are lowercase for consistent comparison
         self.classes = np.unique(self.ys)
 
-    def _single_evaluate(self, x: np.ndarray, y: np.ndarray, pred: np.ndarray) -> float:
+    def _evaluate(self, xs: np.ndarray, ys: np.ndarray, preds: np.ndarray) -> List[float]:
         """Calculate the score for a single prediction."""
-        return self.metric([y], [pred])
+        scores = []
+        for pred, y in zip(preds, ys):
+            scores.append(self.metric([y], [pred]))
+        return scores
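To see what the new per-prediction loop computes, here is a standalone sketch that mirrors it with sklearn's `accuracy_score` (which this module imports); the helper name `per_prediction_scores` is invented for the example and is not part of the library:

```python
from typing import List

import numpy as np
from sklearn.metrics import accuracy_score


def per_prediction_scores(ys: np.ndarray, preds: np.ndarray, metric=accuracy_score) -> List[float]:
    # Same pattern as the loop added in ClassificationTask._evaluate:
    # each prediction is scored against its own label, so with accuracy
    # every item contributes 1.0 (match) or 0.0 (mismatch).
    return [metric([y], [pred]) for pred, y in zip(preds, ys)]


ys = np.array(["positive", "negative", "neutral"])
preds = np.array(["positive", "neutral", "neutral"])
print(per_prediction_scores(ys, preds))  # -> [1.0, 0.0, 1.0]
```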
