Merged
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -88,7 +88,7 @@ jobs:
       - name: Setup node (for langfuse server)
         uses: actions/setup-node@v3
         with:
-          node-version: 20
+          node-version: 24
 
       - name: Cache langfuse server dependencies
         uses: actions/cache@v3
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.2
+    rev: v0.14.4
     hooks:
       # Run the linter and fix
       - id: ruff
@@ -10,6 +10,7 @@ repos:
       # Run the formatter.
       - id: ruff-format
         types_or: [python, pyi, jupyter]
+        args: [--config=ci.ruff.toml]
 
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.18.2
14 changes: 14 additions & 0 deletions langfuse/__init__.py
@@ -1,5 +1,13 @@
""".. include:: ../README.md"""

from langfuse.batch_evaluation import (
BatchEvaluationResult,
BatchEvaluationResumeToken,
CompositeEvaluatorFunction,
EvaluatorInputs,
EvaluatorStats,
MapperFunction,
)
from langfuse.experiment import Evaluation

from ._client import client as _client_module
@@ -41,6 +49,12 @@
"LangfuseRetriever",
"LangfuseGuardrail",
"Evaluation",
"EvaluatorInputs",
"MapperFunction",
"CompositeEvaluatorFunction",
"EvaluatorStats",
"BatchEvaluationResumeToken",
"BatchEvaluationResult",
"experiment",
"api",
]
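
With these re-exports, the batch-evaluation types become importable straight from the package root. A minimal sketch of the new imports, exactly as declared in the diff above:

from langfuse import (
    BatchEvaluationResult,
    BatchEvaluationResumeToken,
    CompositeEvaluatorFunction,
    EvaluatorInputs,
    EvaluatorStats,
    MapperFunction,
)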
297 changes: 296 additions & 1 deletion langfuse/_client/client.py

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions langfuse/_client/datasets.py
@@ -4,6 +4,7 @@
 
 from opentelemetry.util._decorator import _agnosticcontextmanager
 
+from langfuse.batch_evaluation import CompositeEvaluatorFunction
 from langfuse.experiment import (
     EvaluatorFunction,
     ExperimentResult,
@@ -204,6 +205,7 @@ def run_experiment(
         description: Optional[str] = None,
         task: TaskFunction,
         evaluators: List[EvaluatorFunction] = [],
+        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
         run_evaluators: List[RunEvaluatorFunction] = [],
         max_concurrency: int = 50,
         metadata: Optional[Dict[str, Any]] = None,
@@ -234,6 +236,10 @@
             .metadata attributes. Signature should be: task(*, item, **kwargs) -> Any
         evaluators: List of functions to evaluate each item's output individually.
             These will have access to the item's expected_output for comparison.
+        composite_evaluator: Optional function that creates composite scores from item-level evaluations.
+            Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
+            plus the list of evaluations from item-level evaluators. Useful for weighted averages,
+            pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
         run_evaluators: List of functions to evaluate the entire experiment run.
             Useful for computing aggregate statistics across all dataset items.
         max_concurrency: Maximum number of concurrent task executions (default: 50).
@@ -411,6 +417,7 @@ def content_diversity(*, item_results, **kwargs):
             data=self.items,
             task=task,
             evaluators=evaluators,
+            composite_evaluator=composite_evaluator,
             run_evaluators=run_evaluators,
             max_concurrency=max_concurrency,
             metadata=metadata,
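
Per the docstring added above, a composite evaluator receives the item-level evaluator inputs plus the list of item-level evaluations and folds them into one score. A minimal sketch, assuming the list arrives as a keyword named evaluations and that Evaluation carries name and value fields (neither keyword name nor field set is confirmed by this diff; the metric names and weights are hypothetical):

from langfuse import Evaluation

def weighted_composite(*, input, output, expected_output, metadata, evaluations, **kwargs):
    # Hypothetical weights keyed by assumed item-level metric names.
    weights = {"accuracy": 0.7, "conciseness": 0.3}
    # Weighted sum over the numeric item-level scores.
    total = sum(
        weights.get(e.name, 0.0) * float(e.value)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(name="weighted_score", value=total)

It would then be passed to run_experiment alongside the item-level evaluators, e.g. composite_evaluator=weighted_composite.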
8 changes: 6 additions & 2 deletions langfuse/_client/observe.py
@@ -589,7 +589,9 @@ def __next__(self) -> Any:
             raise  # Re-raise StopIteration
 
         except Exception as e:
-            self.span.update(level="ERROR", status_message=str(e) or type(e).__name__).end()
+            self.span.update(
+                level="ERROR", status_message=str(e) or type(e).__name__
+            ).end()
 
             raise

@@ -654,6 +656,8 @@ async def __anext__(self) -> Any:

             raise  # Re-raise StopAsyncIteration
         except Exception as e:
-            self.span.update(level="ERROR", status_message=str(e) or type(e).__name__).end()
+            self.span.update(
+                level="ERROR", status_message=str(e) or type(e).__name__
+            ).end()
 
             raise