Merged
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -88,7 +88,7 @@ jobs:
       - name: Setup node (for langfuse server)
         uses: actions/setup-node@v3
         with:
-          node-version: 20
+          node-version: 24
 
       - name: Cache langfuse server dependencies
         uses: actions/cache@v3
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.2
+    rev: v0.14.4
     hooks:
       # Run the linter and fix
       - id: ruff
@@ -10,6 +10,7 @@ repos:
       # Run the formatter.
       - id: ruff-format
         types_or: [python, pyi, jupyter]
+        args: [--config=ci.ruff.toml]
 
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.18.2
14 changes: 14 additions & 0 deletions langfuse/__init__.py
@@ -1,5 +1,13 @@
""".. include:: ../README.md"""

from langfuse.batch_evaluation import (
BatchEvaluationResult,
BatchEvaluationResumeToken,
CompositeEvaluatorFunction,
EvaluatorInputs,
EvaluatorStats,
MapperFunction,
)
from langfuse.experiment import Evaluation

from ._client import client as _client_module
@@ -41,6 +49,12 @@
"LangfuseRetriever",
"LangfuseGuardrail",
"Evaluation",
"EvaluatorInputs",
"MapperFunction",
"CompositeEvaluatorFunction",
"EvaluatorStats",
"BatchEvaluationResumeToken",
"BatchEvaluationResult",
"experiment",
"api",
]
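
With these re-exports, the batch-evaluation types become importable straight from the package root. A minimal sketch of the new imports, exactly as declared in the diff above:

from langfuse import (
    BatchEvaluationResult,
    BatchEvaluationResumeToken,
    CompositeEvaluatorFunction,
    EvaluatorInputs,
    EvaluatorStats,
    MapperFunction,
)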
297 changes: 296 additions & 1 deletion langfuse/_client/client.py

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions langfuse/_client/datasets.py
@@ -4,6 +4,7 @@
 
 from opentelemetry.util._decorator import _agnosticcontextmanager
 
+from langfuse.batch_evaluation import CompositeEvaluatorFunction
 from langfuse.experiment import (
     EvaluatorFunction,
     ExperimentResult,
@@ -204,6 +205,7 @@ def run_experiment(
         description: Optional[str] = None,
         task: TaskFunction,
         evaluators: List[EvaluatorFunction] = [],
+        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
         run_evaluators: List[RunEvaluatorFunction] = [],
         max_concurrency: int = 50,
         metadata: Optional[Dict[str, Any]] = None,
@@ -234,6 +236,10 @@
             .metadata attributes. Signature should be: task(*, item, **kwargs) -> Any
         evaluators: List of functions to evaluate each item's output individually.
             These will have access to the item's expected_output for comparison.
+        composite_evaluator: Optional function that creates composite scores from item-level evaluations.
+            Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
+            plus the list of evaluations from item-level evaluators. Useful for weighted averages,
+            pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
         run_evaluators: List of functions to evaluate the entire experiment run.
             Useful for computing aggregate statistics across all dataset items.
         max_concurrency: Maximum number of concurrent task executions (default: 50).
@@ -411,6 +417,7 @@ def content_diversity(*, item_results, **kwargs):
             data=self.items,
             task=task,
             evaluators=evaluators,
+            composite_evaluator=composite_evaluator,
             run_evaluators=run_evaluators,
             max_concurrency=max_concurrency,
             metadata=metadata,
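
Per the docstring added above, a composite evaluator receives the item-level evaluator inputs plus the list of item-level evaluations and folds them into one score. A minimal sketch, assuming the list arrives as a keyword named evaluations and that Evaluation carries name and value fields (neither keyword name nor field set is confirmed by this diff; the metric names and weights are hypothetical):

from langfuse import Evaluation

def weighted_composite(*, input, output, expected_output, metadata, evaluations, **kwargs):
    # Hypothetical weights keyed by assumed item-level metric names.
    weights = {"accuracy": 0.7, "conciseness": 0.3}
    # Weighted sum over the numeric item-level scores.
    total = sum(
        weights.get(e.name, 0.0) * float(e.value)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(name="weighted_score", value=total)

It would then be passed to run_experiment alongside the item-level evaluators, e.g. composite_evaluator=weighted_composite.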
8 changes: 6 additions & 2 deletions langfuse/_client/observe.py
@@ -589,7 +589,9 @@ def __next__(self) -> Any:
             raise  # Re-raise StopIteration
 
         except Exception as e:
-            self.span.update(level="ERROR", status_message=str(e) or type(e).__name__).end()
+            self.span.update(
+                level="ERROR", status_message=str(e) or type(e).__name__
+            ).end()
 
             raise

@@ -654,6 +656,8 @@ async def __anext__(self) -> Any:

             raise  # Re-raise StopAsyncIteration
         except Exception as e:
-            self.span.update(level="ERROR", status_message=str(e) or type(e).__name__).end()
+            self.span.update(
+                level="ERROR", status_message=str(e) or type(e).__name__
+            ).end()
 
             raise