From 7a2232a4d2e65996e22f47d3016c7ec4a358b173 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 11:24:26 +0200 Subject: [PATCH 01/25] feat(experiments): add experiment runner --- langfuse/_client/client.py | 288 +++++++++++++++++++++++++++++++++++ langfuse/_client/datasets.py | 71 ++++++++- 2 files changed, 358 insertions(+), 1 deletion(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index df243e51c..7bfe2ac52 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -3,6 +3,7 @@ This module implements Langfuse's core observability functionality on top of the OpenTelemetry (OTel) standard. """ +import asyncio import logging import os import re @@ -13,6 +14,7 @@ from time import time_ns from typing import ( Any, + Callable, Dict, List, Literal, @@ -44,6 +46,11 @@ get_observation_types_list, ) from langfuse._client.datasets import DatasetClient, DatasetItemClient +from langfuse._client.experiments import ( + ExperimentItem, + ExperimentItemResult, + ExperimentResult, +) from langfuse._client.environment_variables import ( LANGFUSE_DEBUG, LANGFUSE_HOST, @@ -2444,6 +2451,287 @@ def get_dataset( handle_fern_exception(e) raise e + def run_experiment( + self, + *, + name: str, + description: Optional[str] = None, + data: Union[ + List[Union[ExperimentItem, dict, DatasetItem]], List[DatasetItemClient] + ], + task: Callable[ + [Union[ExperimentItem, dict, DatasetItem, DatasetItemClient]], Any + ], + evaluators: Optional[List[Callable]] = None, + run_evaluators: Optional[List[Callable]] = None, + max_concurrency: Optional[int] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> ExperimentResult: + """Run an experiment on a dataset with automatic tracing and evaluation. + + This method executes a task function on each item in the provided dataset, + traces the execution with Langfuse, runs evaluators on the outputs, + and returns formatted results. 
+ + Args: + name: Human-readable name for the experiment + description: Optional description of the experiment's purpose + data: Array of data items to process (ExperimentItem or DatasetItem) + task: Function that processes each data item and returns output + evaluators: Optional list of functions to evaluate each item's output + run_evaluators: Optional list of functions to evaluate the entire experiment + max_concurrency: Maximum number of concurrent task executions + metadata: Optional metadata to attach to the experiment + + Returns: + ExperimentResult containing item results, evaluations, and formatting functions + + Example: + ```python + def task(item): + return f"Processed: {item['input']}" + + def evaluator(*, input, output, expected_output=None, **kwargs): + return {"name": "length", "value": len(output)} + + result = langfuse.run_experiment( + name="Test Experiment", + data=[{"input": "test", "expected_output": "expected"}], + task=task, + evaluators=[evaluator] + ) + + print(result["item_results"]) + ``` + """ + return asyncio.run( + self._run_experiment_async( + name=name, + description=description, + data=data, + task=task, + evaluators=evaluators or [], + run_evaluators=run_evaluators or [], + max_concurrency=max_concurrency, + metadata=metadata or {}, + ) + ) + + async def _run_experiment_async( + self, + *, + name: str, + description: Optional[str], + data: Union[ + List[Union[ExperimentItem, dict, DatasetItem]], List[DatasetItemClient] + ], + task: Callable, + evaluators: List[Callable], + run_evaluators: List[Callable], + max_concurrency: Optional[int], + metadata: Dict[str, Any], + ) -> ExperimentResult: + """Internal async implementation of run_experiment.""" + from langfuse._client.experiments import _run_evaluator + + langfuse_logger.debug(f"Starting experiment '{name}' with {len(data)} items") + + # Set up concurrency control + max_workers = ( + max_concurrency if max_concurrency is not None else min(len(data), 10) + ) + semaphore = asyncio.Semaphore(max_workers) + + # Process all items + async def process_item( + item: Union[ExperimentItem, dict, DatasetItem, DatasetItemClient], + ) -> dict: + async with semaphore: + return await self._process_experiment_item( + item, task, evaluators, name, description, metadata + ) + + # Run all items concurrently + tasks = [process_item(item) for item in data] + item_results = await asyncio.gather(*tasks, return_exceptions=True) + + # Filter out any exceptions and log errors + valid_results: List[ExperimentItemResult] = [] + for i, result in enumerate(item_results): + if isinstance(result, Exception): + langfuse_logger.error(f"Item {i} failed: {result}") + elif isinstance(result, dict): + # Type-cast since we know the structure matches ExperimentItemResult + valid_results.append(result) # type: ignore + + # Run experiment-level evaluators + run_evaluations = [] + for run_evaluator in run_evaluators: + try: + evaluations = await _run_evaluator( + run_evaluator, item_results=valid_results + ) + run_evaluations.extend(evaluations) + except Exception as e: + langfuse_logger.error(f"Run evaluator failed: {e}") + + # Generate dataset run URL if applicable + dataset_run_id = ( + valid_results[0].get("dataset_run_id") if valid_results else None + ) + dataset_run_url = None + if dataset_run_id and data: + try: + # Check if the first item has dataset_id (for DatasetItem objects) + first_item = data[0] + dataset_id = None + if hasattr(first_item, "dataset_id"): + dataset_id = getattr(first_item, "dataset_id", None) + + if dataset_id: + 
project_id = self._get_project_id() + if project_id: + dataset_run_url = f"{self._host}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}" + except Exception: + pass # URL generation is optional + + # Store run-level evaluations as scores + for evaluation in run_evaluations: + try: + if dataset_run_id: + self.create_score( + dataset_run_id=dataset_run_id, + name=evaluation["name"], + value=evaluation["value"], + comment=evaluation.get("comment"), + metadata=evaluation.get("metadata"), + ) + except Exception as e: + langfuse_logger.error(f"Failed to store run evaluation: {e}") + + return { + "item_results": valid_results, + "run_evaluations": run_evaluations, + "dataset_run_id": dataset_run_id, + "dataset_run_url": dataset_run_url, + } + + async def _process_experiment_item( + self, + item: Union[ExperimentItem, dict, DatasetItem, DatasetItemClient], + task: Callable, + evaluators: List[Callable], + experiment_name: str, + experiment_description: Optional[str], + experiment_metadata: Dict[str, Any], + ) -> dict: + """Process a single experiment item with tracing and evaluation.""" + from langfuse._client.experiments import _run_evaluator, _run_task + + # Execute task with tracing + span_name = "experiment-item-run" + with self.start_as_current_span(name=span_name) as span: + try: + # Run the task + output = await _run_task(task, item) + + # Update span with input/output + input_data = ( + item.get("input") + if isinstance(item, dict) + else getattr(item, "input", None) + ) + # Prepare metadata + item_metadata: Dict[str, Any] = {} + if isinstance(item, dict): + item_metadata = item.get("metadata", {}) or {} + + final_metadata = { + "experiment_name": experiment_name, + **experiment_metadata, + } + if isinstance(item_metadata, dict): + final_metadata.update(item_metadata) + + span.update( + input=input_data, + output=output, + metadata=final_metadata, + ) + + # Get trace ID for linking + trace_id = span.trace_id + dataset_run_id = None + + # Link to dataset run if this is a dataset item + if hasattr(item, "id") and hasattr(item, "dataset_id"): + try: + from langfuse.model import CreateDatasetRunItemRequest + + dataset_run_item = self.api.dataset_run_items.create( + request=CreateDatasetRunItemRequest( + runName=experiment_name, + runDescription=experiment_description, + metadata=experiment_metadata, + datasetItemId=item.id, # type: ignore + traceId=trace_id, + ) + ) + dataset_run_id = dataset_run_item.dataset_run_id + except Exception as e: + langfuse_logger.error(f"Failed to create dataset run item: {e}") + + # Run evaluators + evaluations = [] + for evaluator in evaluators: + try: + expected_output = None + if isinstance(item, dict): + expected_output = item.get("expected_output") + elif hasattr(item, "expected_output"): + expected_output = item.expected_output + + eval_metadata: Optional[Dict[str, Any]] = None + if isinstance(item, dict): + eval_metadata = item.get("metadata") + elif hasattr(item, "metadata"): + eval_metadata = item.metadata + + eval_results = await _run_evaluator( + evaluator, + input=input_data, + output=output, + expected_output=expected_output, + metadata=eval_metadata, + ) + evaluations.extend(eval_results) + + # Store evaluations as scores + for evaluation in eval_results: + self.create_score( + trace_id=trace_id, + name=evaluation["name"], + value=evaluation["value"], + comment=evaluation.get("comment"), + metadata=evaluation.get("metadata"), + ) + except Exception as e: + langfuse_logger.error(f"Evaluator failed: {e}") + + return { + "item": item, + 
"output": output, + "evaluations": evaluations, + "trace_id": trace_id, + "dataset_run_id": dataset_run_id, + } + + except Exception as e: + span.update( + output=f"Error: {str(e)}", level="ERROR", status_message=str(e) + ) + raise e + def auth_check(self) -> bool: """Check if the provided credentials (public and secret key) are valid. diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index f06570e57..4589d9d25 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -1,7 +1,7 @@ import datetime as dt import logging from .span import LangfuseSpan -from typing import TYPE_CHECKING, Any, Generator, List, Optional +from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional from opentelemetry.util._decorator import _agnosticcontextmanager @@ -181,3 +181,72 @@ def __init__(self, dataset: Dataset, items: List[DatasetItemClient]): self.created_at = dataset.created_at self.updated_at = dataset.updated_at self.items = items + self._langfuse: Optional["Langfuse"] = None + + def _get_langfuse_client(self) -> Optional["Langfuse"]: + """Get the Langfuse client from the first item.""" + if self._langfuse is None and self.items: + self._langfuse = self.items[0].langfuse + return self._langfuse + + def run_experiment( + self, + *, + name: str, + description: Optional[str] = None, + task: Any, + evaluators: Optional[List[Any]] = None, + run_evaluators: Optional[List[Any]] = None, + max_concurrency: Optional[int] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> Any: + """Run an experiment on this dataset. + + This is a convenience method that calls the Langfuse client's run_experiment + method with this dataset's items as the data. + + Args: + name: Human-readable name for the experiment + description: Optional description of the experiment's purpose + task: Function that processes each data item and returns output + evaluators: Optional list of functions to evaluate each item's output + run_evaluators: Optional list of functions to evaluate the entire experiment + max_concurrency: Maximum number of concurrent task executions + metadata: Optional metadata to attach to the experiment + + Returns: + ExperimentResult containing item results, evaluations, and formatting functions + + Example: + ```python + dataset = langfuse.get_dataset("my-dataset") + + def task(item): + return f"Processed: {item.input}" + + def evaluator(*, input, output, expected_output=None, **kwargs): + return {"name": "length", "value": len(output)} + + result = dataset.run_experiment( + name="Dataset Test Experiment", + task=task, + evaluators=[evaluator] + ) + + print(result["item_results"]) + ``` + """ + langfuse_client = self._get_langfuse_client() + if not langfuse_client: + raise ValueError("No Langfuse client available. 
Dataset items are empty.") + + return langfuse_client.run_experiment( + name=name, + description=description, + data=self.items, + task=task, + evaluators=evaluators, + run_evaluators=run_evaluators, + max_concurrency=max_concurrency, + metadata=metadata, + ) From 2cbf43b881d69c27ea6e669179820a4bcbceb8ca Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:15:29 +0200 Subject: [PATCH 02/25] push --- langfuse/_client/experiments.py | 324 ++++++++++++++++++++ tests/test_experiments.py | 520 ++++++++++++++++++++++++++++++++ 2 files changed, 844 insertions(+) create mode 100644 langfuse/_client/experiments.py create mode 100644 tests/test_experiments.py diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py new file mode 100644 index 000000000..db27153e0 --- /dev/null +++ b/langfuse/_client/experiments.py @@ -0,0 +1,324 @@ +"""Langfuse experiment functionality for running and evaluating tasks on datasets. + +This module provides the core experiment functionality for the Langfuse Python SDK, +allowing users to run experiments on datasets with automatic tracing, evaluation, +and result formatting. +""" + +import asyncio +import logging +from typing import ( + TYPE_CHECKING, + Any, + Awaitable, + Dict, + List, + Optional, + Protocol, + TypedDict, + Union, +) + +from langfuse.model import DatasetItem + +if TYPE_CHECKING: + from langfuse._client.datasets import DatasetItemClient + + +class ExperimentItem(TypedDict, total=False): + """Structure for experiment data items. + + Args: + input: The input data to pass to the task function + expected_output: Optional expected output for evaluation purposes + metadata: Optional metadata for the experiment item + """ + + input: Any + expected_output: Any + metadata: Optional[Dict[str, Any]] + + +class Evaluation(TypedDict, total=False): + """Structure for evaluation results. + + Args: + name: Name of the evaluation metric + value: The evaluation score/value (numeric or string) + comment: Optional comment explaining the evaluation + metadata: Optional metadata for the evaluation + """ + + name: str + value: Union[int, float, str, bool] + comment: Optional[str] + metadata: Optional[Dict[str, Any]] + + +class ExperimentItemResult(TypedDict): + """Result structure for individual experiment items. + + Args: + item: The original experiment item that was processed + output: The actual output produced by the task + evaluations: List of evaluation results for this item + trace_id: Langfuse trace ID for this item's execution + dataset_run_id: Dataset run ID if this item was part of a Langfuse dataset + """ + + item: Union[ExperimentItem, DatasetItem] + output: Any + evaluations: List[Evaluation] + trace_id: Optional[str] + dataset_run_id: Optional[str] + + +class ExperimentResult(TypedDict): + """Complete result structure for experiment execution. 
+ + Args: + item_results: Results from processing each individual data item + run_evaluations: Results from run-level evaluators + dataset_run_id: ID of the dataset run (if using Langfuse datasets) + dataset_run_url: URL to view the dataset run in Langfuse UI + """ + + item_results: List[ExperimentItemResult] + run_evaluations: List[Evaluation] + dataset_run_id: Optional[str] + dataset_run_url: Optional[str] + + +class TaskFunction(Protocol): + """Protocol for experiment task functions.""" + + def __call__( + self, item: Union[ExperimentItem, dict, DatasetItem, "DatasetItemClient"] + ) -> Union[Any, Awaitable[Any]]: + """Execute the task on an experiment item. + + Args: + item: The experiment or dataset item to process + + Returns: + The task output (can be sync or async) + """ + ... + + +class EvaluatorFunction(Protocol): + """Protocol for item-level evaluator functions.""" + + def __call__( + self, + *, + input: Any, + output: Any, + expected_output: Any = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> Union[ + Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] + ]: + """Evaluate a task output. + + Args: + input: The original input to the task + output: The output produced by the task + expected_output: The expected output (if available) + metadata: Optional metadata from the experiment item + + Returns: + Single evaluation or list of evaluations (can be sync or async) + """ + ... + + +class RunEvaluatorFunction(Protocol): + """Protocol for run-level evaluator functions.""" + + def __call__( + self, *, item_results: List[ExperimentItemResult] + ) -> Union[ + Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] + ]: + """Evaluate the entire experiment run. + + Args: + item_results: Results from all processed experiment items + + Returns: + Single evaluation or list of evaluations (can be sync or async) + """ + ... + + +def format_experiment_results( + item_results: List[ExperimentItemResult], + run_evaluations: List[Evaluation], + experiment_name: str, + experiment_description: Optional[str] = None, + dataset_run_url: Optional[str] = None, + include_item_results: bool = False, +) -> str: + """Format experiment results for display. + + Args: + item_results: Results from processing each item + run_evaluations: Results from run-level evaluators + experiment_name: Name of the experiment + experiment_description: Optional description of the experiment + dataset_run_url: Optional URL to dataset run in Langfuse UI + include_item_results: Whether to include individual item details + + Returns: + Formatted string representation of the results + """ + if not item_results: + return "No experiment results to display." + + output = "" + + # Individual results + if include_item_results: + for i, result in enumerate(item_results): + output += f"\n{i + 1}. 
Item {i + 1}:\n"
+
+            # Input, expected, and actual
+            item_input = None
+            if isinstance(result["item"], dict):
+                item_input = result["item"].get("input")
+            elif hasattr(result["item"], "input"):
+                item_input = result["item"].input
+
+            if item_input is not None:
+                output += f" Input: {_format_value(item_input)}\n"
+
+            expected_output = None
+            if isinstance(result["item"], dict):
+                expected_output = result["item"].get("expected_output")
+            elif hasattr(result["item"], "expected_output"):
+                expected_output = result["item"].expected_output
+
+            if expected_output is not None:
+                output += f" Expected: {_format_value(expected_output)}\n"
+            output += f" Actual: {_format_value(result['output'])}\n"
+
+            # Scores
+            if result["evaluations"]:
+                output += " Scores:\n"
+                for evaluation in result["evaluations"]:
+                    score = evaluation["value"]
+                    if isinstance(score, (int, float)):
+                        score = f"{score:.3f}"
+                    output += f" • {evaluation['name']}: {score}"
+                    if evaluation.get("comment"):
+                        output += f"\n 💭 {evaluation['comment']}"
+                    output += "\n"
+
+            # Trace link
+            if result.get("trace_id"):
+                # Note: We'd need the langfuse client to generate the actual URL
+                output += f"\n Trace ID: {result['trace_id']}\n"
+    else:
+        output += f"Individual Results: Hidden ({len(item_results)} items)\n"
+        output += "💡 Set include_item_results=True to view them\n"
+
+    # Experiment Overview
+    output += f"\n{'─' * 50}\n"
+    output += f"📊 {experiment_name}"
+    if experiment_description:
+        output += f" - {experiment_description}"
+
+    output += f"\n{len(item_results)} items"
+
+    # Get unique evaluation names
+    evaluation_names = set()
+    for result in item_results:
+        for evaluation in result["evaluations"]:
+            evaluation_names.add(evaluation["name"])
+
+    if evaluation_names:
+        output += "\nEvaluations:"
+        for eval_name in evaluation_names:
+            output += f"\n • {eval_name}"
+        output += "\n"
+
+    # Average scores
+    if evaluation_names:
+        output += "\nAverage Scores:"
+        for eval_name in evaluation_names:
+            scores = []
+            for result in item_results:
+                for evaluation in result["evaluations"]:
+                    if evaluation["name"] == eval_name and isinstance(
+                        evaluation["value"], (int, float)
+                    ):
+                        scores.append(evaluation["value"])
+
+            if scores:
+                avg = sum(scores) / len(scores)
+                output += f"\n • {eval_name}: {avg:.3f}"
+        output += "\n"
+
+    # Run evaluations
+    if run_evaluations:
+        output += "\nRun Evaluations:"
+        for run_eval in run_evaluations:
+            score = run_eval["value"]
+            if isinstance(score, (int, float)):
+                score = f"{score:.3f}"
+            output += f"\n • {run_eval['name']}: {score}"
+            if run_eval.get("comment"):
+                output += f"\n 💭 {run_eval['comment']}"
+        output += "\n"
+
+    if dataset_run_url:
+        output += f"\n🔗 Dataset Run:\n {dataset_run_url}"
+
+    return output
+
+
+def _format_value(value: Any) -> str:
+    """Format a value for display."""
+    if isinstance(value, str):
+        return value[:50] + "..."
if len(value) > 50 else value + return str(value) + + +async def _run_evaluator( + evaluator: EvaluatorFunction, **kwargs: Any +) -> List[Evaluation]: + """Run an evaluator function and normalize the result.""" + try: + result = evaluator(**kwargs) + + # Handle async evaluators + if asyncio.iscoroutine(result): + result = await result + + # Normalize to list + if isinstance(result, dict): + return [result] + elif isinstance(result, list): + return result + else: + return [] + + except Exception as e: + evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator") + logging.getLogger("langfuse").error(f"Evaluator {evaluator_name} failed: {e}") + return [] + + +async def _run_task( + task: TaskFunction, + item: Union[ExperimentItem, dict, DatasetItem, "DatasetItemClient"], +) -> Any: + """Run a task function and handle sync/async.""" + result = task(item) + + # Handle async tasks + if asyncio.iscoroutine(result): + result = await result + + return result diff --git a/tests/test_experiments.py b/tests/test_experiments.py new file mode 100644 index 000000000..4384001f4 --- /dev/null +++ b/tests/test_experiments.py @@ -0,0 +1,520 @@ +"""Comprehensive tests for Langfuse experiment functionality matching JS SDK.""" + +import time + +import pytest + +from langfuse import get_client +from tests.utils import create_uuid, get_api + + +@pytest.fixture +def sample_dataset(): + """Sample dataset for experiments.""" + return [ + {"input": "Germany", "expected_output": "Berlin"}, + {"input": "France", "expected_output": "Paris"}, + {"input": "Spain", "expected_output": "Madrid"}, + ] + + +def mock_task(item): + """Mock task function that simulates processing.""" + input_val = ( + item.get("input") + if isinstance(item, dict) + else getattr(item, "input", "unknown") + ) + return f"Capital of {input_val}" + + +def simple_evaluator(*, input, output, expected_output=None, **kwargs): + """Simple evaluator that returns output length.""" + return {"name": "length_check", "value": len(output)} + + +def factuality_evaluator(*, input, output, expected_output=None, **kwargs): + """Mock factuality evaluator.""" + # Simple mock: check if expected output is in the output + if expected_output and expected_output.lower() in output.lower(): + return {"name": "factuality", "value": 1.0, "comment": "Correct answer found"} + return {"name": "factuality", "value": 0.0, "comment": "Incorrect answer"} + + +def run_evaluator_average_length(*, item_results, **kwargs): + """Run evaluator that calculates average output length.""" + if not item_results: + return {"name": "average_length", "value": 0} + + avg_length = sum(len(r["output"]) for r in item_results) / len(item_results) + return {"name": "average_length", "value": avg_length} + + +# Basic Functionality Tests +def test_run_experiment_on_local_dataset(sample_dataset): + """Test running experiment on local dataset.""" + langfuse_client = get_client() + result = langfuse_client.run_experiment( + name="Euro capitals", + description="Country capital experiment", + data=sample_dataset, + task=mock_task, + evaluators=[simple_evaluator, factuality_evaluator], + run_evaluators=[run_evaluator_average_length], + ) + + # Validate basic result structure + assert len(result["item_results"]) == 3 + assert len(result["run_evaluations"]) == 1 + assert result["run_evaluations"][0]["name"] == "average_length" + assert result["dataset_run_id"] is None # No dataset_run_id for local datasets + + # Validate item results structure + for item_result in result["item_results"]: + assert 
"output" in item_result + assert "evaluations" in item_result + assert "trace_id" in item_result + assert ( + item_result["dataset_run_id"] is None + ) # No dataset_run_id for local datasets + assert len(item_result["evaluations"]) == 2 # Both evaluators should run + + # Flush and wait for server processing + langfuse_client.flush() + time.sleep(2) + + +def test_run_experiment_on_langfuse_dataset(): + """Test running experiment on Langfuse dataset.""" + langfuse_client = get_client() + # Create dataset + dataset_name = "test-dataset-" + create_uuid() + langfuse_client.create_dataset(name=dataset_name) + + # Add items to dataset + test_items = [ + {"input": "Germany", "expected_output": "Berlin"}, + {"input": "France", "expected_output": "Paris"}, + ] + + for item in test_items: + langfuse_client.create_dataset_item( + dataset_name=dataset_name, + input=item["input"], + expected_output=item["expected_output"], + ) + + # Get dataset and run experiment + dataset = langfuse_client.get_dataset(dataset_name) + + result = dataset.run_experiment( + name="Dataset Test", + description="Test on Langfuse dataset", + task=mock_task, + evaluators=[factuality_evaluator], + ) + + # Should have dataset run ID for Langfuse datasets + assert result["dataset_run_id"] is not None + assert len(result["item_results"]) == 2 + assert all(item["dataset_run_id"] is not None for item in result["item_results"]) + + # Flush and wait for server processing + langfuse_client.flush() + time.sleep(3) + + # Verify dataset run exists via API + api = get_api() + runs = api.datasets.get_runs(dataset_name) + assert len(runs.data) >= 1 + + +# Error Handling Tests +def test_evaluator_failures_handled_gracefully(): + """Test that evaluator failures don't break the experiment.""" + langfuse_client = get_client() + + def failing_evaluator(**kwargs): + raise Exception("Evaluator failed") + + def working_evaluator(**kwargs): + return {"name": "working_eval", "value": 1.0} + + result = langfuse_client.run_experiment( + name="Error test", + data=[{"input": "test"}], + task=lambda x: "result", + evaluators=[working_evaluator, failing_evaluator], + ) + + # Should complete with only working evaluator + assert len(result["item_results"]) == 1 + # Only the working evaluator should have produced results + assert ( + len( + [ + eval + for eval in result["item_results"][0]["evaluations"] + if eval["name"] == "working_eval" + ] + ) + == 1 + ) + + langfuse_client.flush() + time.sleep(1) + + +def test_task_failures_handled_gracefully(): + """Test that task failures are handled gracefully and don't stop the experiment.""" + langfuse_client = get_client() + + def failing_task(item): + raise Exception("Task failed") + + def working_task(item): + return f"Processed: {item['input']}" + + # Test with mixed data - some will fail, some will succeed + result = langfuse_client.run_experiment( + name="Task error test", + data=[{"input": "test1"}, {"input": "test2"}], + task=failing_task, + ) + + # Should complete but with no valid results since all tasks failed + assert len(result["item_results"]) == 0 + + langfuse_client.flush() + time.sleep(1) + + +def test_run_evaluator_failures_handled(): + """Test that run evaluator failures don't break the experiment.""" + langfuse_client = get_client() + + def failing_run_evaluator(**kwargs): + raise Exception("Run evaluator failed") + + result = langfuse_client.run_experiment( + name="Run evaluator error test", + data=[{"input": "test"}], + task=lambda x: "result", + run_evaluators=[failing_run_evaluator], + ) + + # 
Should complete but run evaluations should be empty + assert len(result["item_results"]) == 1 + assert len(result["run_evaluations"]) == 0 + + langfuse_client.flush() + time.sleep(1) + + +# Edge Cases Tests +def test_empty_dataset_handling(): + """Test experiment with empty dataset.""" + langfuse_client = get_client() + + result = langfuse_client.run_experiment( + name="Empty dataset test", + data=[], + task=lambda x: "result", + run_evaluators=[run_evaluator_average_length], + ) + + assert len(result["item_results"]) == 0 + assert len(result["run_evaluations"]) == 1 # Run evaluators still execute + + langfuse_client.flush() + time.sleep(1) + + +def test_dataset_with_missing_fields(): + """Test handling dataset with missing fields.""" + langfuse_client = get_client() + + incomplete_dataset = [ + {"input": "Germany"}, # Missing expected_output + {"expected_output": "Paris"}, # Missing input + {"input": "Spain", "expected_output": "Madrid"}, # Complete + ] + + result = langfuse_client.run_experiment( + name="Incomplete data test", + data=incomplete_dataset, + task=lambda x: "result", + ) + + # Should handle missing fields gracefully + assert len(result["item_results"]) == 3 + for item_result in result["item_results"]: + assert "trace_id" in item_result + assert "output" in item_result + + langfuse_client.flush() + time.sleep(1) + + +def test_large_dataset_with_concurrency(): + """Test handling large dataset with concurrency control.""" + langfuse_client = get_client() + + large_dataset = [ + {"input": f"Item {i}", "expected_output": f"Output {i}"} for i in range(20) + ] + + result = langfuse_client.run_experiment( + name="Large dataset test", + data=large_dataset, + task=lambda x: f"Processed {x['input']}", + evaluators=[lambda **kwargs: {"name": "simple_eval", "value": 1.0}], + max_concurrency=5, + ) + + assert len(result["item_results"]) == 20 + for item_result in result["item_results"]: + assert len(item_result["evaluations"]) == 1 + assert "trace_id" in item_result + + langfuse_client.flush() + time.sleep(3) + + +# Evaluator Configuration Tests +def test_single_evaluation_return(): + """Test evaluators returning single evaluation instead of array.""" + langfuse_client = get_client() + + def single_evaluator(**kwargs): + return {"name": "single_eval", "value": 1, "comment": "Single evaluation"} + + result = langfuse_client.run_experiment( + name="Single evaluation test", + data=[{"input": "test"}], + task=lambda x: "result", + evaluators=[single_evaluator], + ) + + assert len(result["item_results"]) == 1 + assert len(result["item_results"][0]["evaluations"]) == 1 + assert result["item_results"][0]["evaluations"][0]["name"] == "single_eval" + + langfuse_client.flush() + time.sleep(1) + + +def test_no_evaluators(): + """Test experiment with no evaluators.""" + langfuse_client = get_client() + + result = langfuse_client.run_experiment( + name="No evaluators test", + data=[{"input": "test"}], + task=lambda x: "result", + evaluators=[], + ) + + assert len(result["item_results"]) == 1 + assert len(result["item_results"][0]["evaluations"]) == 0 + assert len(result["run_evaluations"]) == 0 + + langfuse_client.flush() + time.sleep(1) + + +def test_only_run_evaluators(): + """Test experiment with only run evaluators.""" + langfuse_client = get_client() + + def run_only_evaluator(**kwargs): + return { + "name": "run_only_eval", + "value": 10, + "comment": "Run-level evaluation", + } + + result = langfuse_client.run_experiment( + name="Only run evaluators test", + data=[{"input": "test"}], + 
task=lambda x: "result", + evaluators=[], + run_evaluators=[run_only_evaluator], + ) + + assert len(result["item_results"]) == 1 + assert len(result["item_results"][0]["evaluations"]) == 0 # No item evaluations + assert len(result["run_evaluations"]) == 1 + assert result["run_evaluations"][0]["name"] == "run_only_eval" + + langfuse_client.flush() + time.sleep(1) + + +def test_different_data_types(): + """Test evaluators returning different data types.""" + langfuse_client = get_client() + + def number_evaluator(**kwargs): + return {"name": "number_eval", "value": 42} + + def string_evaluator(**kwargs): + return {"name": "string_eval", "value": "excellent"} + + def boolean_evaluator(**kwargs): + return {"name": "boolean_eval", "value": True} + + result = langfuse_client.run_experiment( + name="Different data types test", + data=[{"input": "test"}], + task=lambda x: "result", + evaluators=[number_evaluator, string_evaluator, boolean_evaluator], + ) + + evaluations = result["item_results"][0]["evaluations"] + assert len(evaluations) == 3 + + eval_by_name = {e["name"]: e["value"] for e in evaluations} + assert eval_by_name["number_eval"] == 42 + assert eval_by_name["string_eval"] == "excellent" + assert eval_by_name["boolean_eval"] is True + + langfuse_client.flush() + time.sleep(1) + + +# Data Persistence Tests +def test_scores_are_persisted(): + """Test that scores are properly persisted to the database.""" + langfuse_client = get_client() + + # Create dataset + dataset_name = "score-persistence-" + create_uuid() + langfuse_client.create_dataset(name=dataset_name) + + langfuse_client.create_dataset_item( + dataset_name=dataset_name, + input="Test input", + expected_output="Test output", + ) + + dataset = langfuse_client.get_dataset(dataset_name) + + def test_evaluator(**kwargs): + return { + "name": "persistence_test", + "value": 0.85, + "comment": "Test evaluation for persistence", + } + + def test_run_evaluator(**kwargs): + return { + "name": "persistence_run_test", + "value": 0.9, + "comment": "Test run evaluation for persistence", + } + + result = dataset.run_experiment( + name="Score persistence test", + description="Test score persistence", + task=mock_task, + evaluators=[test_evaluator], + run_evaluators=[test_run_evaluator], + ) + + assert result["dataset_run_id"] is not None + assert len(result["item_results"]) == 1 + assert len(result["run_evaluations"]) == 1 + + langfuse_client.flush() + time.sleep(3) + + # Verify scores are persisted via API + api = get_api() + runs = api.datasets.get_runs(dataset_name) + assert len(runs.data) >= 1 + + # Verify the run exists with correct name + run_names = [run.name for run in runs.data] + assert "Score persistence test" in run_names + + +def test_multiple_experiments_on_same_dataset(): + """Test running multiple experiments on the same dataset.""" + langfuse_client = get_client() + + # Create dataset + dataset_name = "multi-experiment-" + create_uuid() + langfuse_client.create_dataset(name=dataset_name) + + for item in [ + {"input": "Germany", "expected_output": "Berlin"}, + {"input": "France", "expected_output": "Paris"}, + ]: + langfuse_client.create_dataset_item( + dataset_name=dataset_name, + input=item["input"], + expected_output=item["expected_output"], + ) + + dataset = langfuse_client.get_dataset(dataset_name) + + # Run first experiment + result1 = dataset.run_experiment( + name="Experiment 1", + description="First experiment", + task=mock_task, + evaluators=[factuality_evaluator], + ) + + langfuse_client.flush() + time.sleep(2) + + # 
Run second experiment + result2 = dataset.run_experiment( + name="Experiment 2", + description="Second experiment", + task=mock_task, + evaluators=[simple_evaluator], + ) + + langfuse_client.flush() + time.sleep(2) + + # Both experiments should have different run IDs + assert result1["dataset_run_id"] is not None + assert result2["dataset_run_id"] is not None + assert result1["dataset_run_id"] != result2["dataset_run_id"] + + # Verify both runs exist in database + api = get_api() + runs = api.datasets.get_runs(dataset_name) + assert len(runs.data) >= 2 + + run_names = [run.name for run in runs.data] + assert "Experiment 1" in run_names + assert "Experiment 2" in run_names + + +# Result Formatting Tests +def test_format_experiment_results_basic(): + """Test basic result formatting functionality.""" + langfuse_client = get_client() + + result = langfuse_client.run_experiment( + name="Formatting test", + description="Test result formatting", + data=[{"input": "Hello", "expected_output": "Hi"}], + task=lambda x: f"Processed: {x['input']}", + evaluators=[simple_evaluator], + run_evaluators=[run_evaluator_average_length], + ) + + # Basic validation that result structure is correct for formatting + assert len(result["item_results"]) == 1 + assert len(result["run_evaluations"]) == 1 + assert "trace_id" in result["item_results"][0] + assert "evaluations" in result["item_results"][0] + + langfuse_client.flush() + time.sleep(1) From 9eee51d9449ffc0dda664a957a330b9a18f21ff9 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 15:51:00 +0200 Subject: [PATCH 03/25] push --- langfuse/_client/client.py | 71 +++++++++++++++++++++++--------------- tests/test_experiments.py | 1 + 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 7bfe2ac52..6f3722990 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -46,11 +46,6 @@ get_observation_types_list, ) from langfuse._client.datasets import DatasetClient, DatasetItemClient -from langfuse._client.experiments import ( - ExperimentItem, - ExperimentItemResult, - ExperimentResult, -) from langfuse._client.environment_variables import ( LANGFUSE_DEBUG, LANGFUSE_HOST, @@ -61,6 +56,13 @@ LANGFUSE_TRACING_ENABLED, LANGFUSE_TRACING_ENVIRONMENT, ) +from langfuse._client.experiments import ( + ExperimentItem, + ExperimentItemResult, + ExperimentResult, + _run_evaluator, + _run_task, +) from langfuse._client.resource_manager import LangfuseResourceManager from langfuse._client.span import ( LangfuseAgent, @@ -742,7 +744,7 @@ def start_generation( cost_details: Optional[Dict[str, float]] = None, prompt: Optional[PromptClient] = None, ) -> LangfuseGeneration: - """[DEPRECATED] Create a new generation span for model generations. + """Create a new generation span for model generations. DEPRECATED: This method is deprecated and will be removed in a future version. Use start_observation(as_type='generation') instead. @@ -838,7 +840,7 @@ def start_as_current_generation( prompt: Optional[PromptClient] = None, end_on_exit: Optional[bool] = None, ) -> _AgnosticContextManager[LangfuseGeneration]: - """[DEPRECATED] Create a new generation span and set it as the current span in a context manager. + """Create a new generation span and set it as the current span in a context manager. DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='generation') instead. 
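For readers following the deprecation notes in the hunks above, a minimal migration sketch for the recommended replacement is shown below. Only `as_type="generation"` and the `name` parameter are taken from the docstrings in this diff; any further keyword arguments, and the `update()` call, are assumed to mirror the span API used elsewhere in this patch.

```python
from langfuse import get_client

langfuse = get_client()

# Deprecated pattern being phased out:
#   with langfuse.start_as_current_generation(name="summarize") as generation:
#       ...

# Recommended replacement per the docstring above (illustrative sketch, not part of this patch):
with langfuse.start_as_current_observation(
    as_type="generation",
    name="summarize",
) as generation:
    # update() is assumed to behave as on spans elsewhere in this diff
    generation.update(output="...model response...")
```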
@@ -2531,9 +2533,6 @@ async def _run_experiment_async( max_concurrency: Optional[int], metadata: Dict[str, Any], ) -> ExperimentResult: - """Internal async implementation of run_experiment.""" - from langfuse._client.experiments import _run_evaluator - langfuse_logger.debug(f"Starting experiment '{name}' with {len(data)} items") # Set up concurrency control @@ -2561,7 +2560,6 @@ async def process_item( if isinstance(result, Exception): langfuse_logger.error(f"Item {i} failed: {result}") elif isinstance(result, dict): - # Type-cast since we know the structure matches ExperimentItemResult valid_results.append(result) # type: ignore # Run experiment-level evaluators @@ -2585,13 +2583,16 @@ async def process_item( # Check if the first item has dataset_id (for DatasetItem objects) first_item = data[0] dataset_id = None + if hasattr(first_item, "dataset_id"): dataset_id = getattr(first_item, "dataset_id", None) if dataset_id: project_id = self._get_project_id() + if project_id: dataset_run_url = f"{self._host}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}" + except Exception: pass # URL generation is optional @@ -2606,6 +2607,7 @@ async def process_item( comment=evaluation.get("comment"), metadata=evaluation.get("metadata"), ) + except Exception as e: langfuse_logger.error(f"Failed to store run evaluation: {e}") @@ -2625,31 +2627,38 @@ async def _process_experiment_item( experiment_description: Optional[str], experiment_metadata: Dict[str, Any], ) -> dict: - """Process a single experiment item with tracing and evaluation.""" - from langfuse._client.experiments import _run_evaluator, _run_task - # Execute task with tracing span_name = "experiment-item-run" + with self.start_as_current_span(name=span_name) as span: try: - # Run the task output = await _run_task(task, item) - # Update span with input/output input_data = ( item.get("input") if isinstance(item, dict) else getattr(item, "input", None) ) - # Prepare metadata + item_metadata: Dict[str, Any] = {} + if isinstance(item, dict): - item_metadata = item.get("metadata", {}) or {} + item_metadata = item.get("metadata", None) or {} final_metadata = { "experiment_name": experiment_name, **experiment_metadata, } + + if ( + not isinstance(item, dict) + and hasattr(item, "dataset_id") + and hasattr(item, "id") + ): + final_metadata.update( + {"dataset_id": item.dataset_id, "dataset_item_id": item.id} + ) + if isinstance(item_metadata, dict): final_metadata.update(item_metadata) @@ -2668,30 +2677,37 @@ async def _process_experiment_item( try: from langfuse.model import CreateDatasetRunItemRequest - dataset_run_item = self.api.dataset_run_items.create( - request=CreateDatasetRunItemRequest( - runName=experiment_name, - runDescription=experiment_description, - metadata=experiment_metadata, - datasetItemId=item.id, # type: ignore - traceId=trace_id, + dataset_run_item = ( + await self.async_api.dataset_run_items.create( + request=CreateDatasetRunItemRequest( + runName=experiment_name, + runDescription=experiment_description, + metadata=experiment_metadata, + datasetItemId=item.id, # type: ignore + traceId=trace_id, + ) ) ) + dataset_run_id = dataset_run_item.dataset_run_id + except Exception as e: langfuse_logger.error(f"Failed to create dataset run item: {e}") # Run evaluators evaluations = [] + for evaluator in evaluators: try: expected_output = None + if isinstance(item, dict): expected_output = item.get("expected_output") elif hasattr(item, "expected_output"): expected_output = item.expected_output eval_metadata: Optional[Dict[str, 
Any]] = None + if isinstance(item, dict): eval_metadata = item.get("metadata") elif hasattr(item, "metadata"): @@ -2710,11 +2726,12 @@ async def _process_experiment_item( for evaluation in eval_results: self.create_score( trace_id=trace_id, - name=evaluation["name"], - value=evaluation["value"], + name=evaluation.get("name", "unknown"), + value=evaluation.get("value", -1), comment=evaluation.get("comment"), metadata=evaluation.get("metadata"), ) + except Exception as e: langfuse_logger.error(f"Evaluator failed: {e}") diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 4384001f4..86cf0845c 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -112,6 +112,7 @@ def test_run_experiment_on_langfuse_dataset(): description="Test on Langfuse dataset", task=mock_task, evaluators=[factuality_evaluator], + run_evaluators=[run_evaluator_average_length], ) # Should have dataset run ID for Langfuse datasets From 00565f698039bb93fdd9547950231f978134ecca Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:07:42 +0200 Subject: [PATCH 04/25] push --- langfuse/_client/client.py | 41 ++++++------- langfuse/_client/datasets.py | 16 +++-- langfuse/_client/experiments.py | 32 ++++++---- tests/test_experiments.py | 101 +++++++++++++++++++------------- 4 files changed, 108 insertions(+), 82 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 6f3722990..b39ce8a0d 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -57,9 +57,13 @@ LANGFUSE_TRACING_ENVIRONMENT, ) from langfuse._client.experiments import ( + EvaluatorFunction, + ExperimentData, ExperimentItem, ExperimentItemResult, ExperimentResult, + RunEvaluatorFunction, + TaskFunction, _run_evaluator, _run_task, ) @@ -2458,15 +2462,11 @@ def run_experiment( *, name: str, description: Optional[str] = None, - data: Union[ - List[Union[ExperimentItem, dict, DatasetItem]], List[DatasetItemClient] - ], - task: Callable[ - [Union[ExperimentItem, dict, DatasetItem, DatasetItemClient]], Any - ], - evaluators: Optional[List[Callable]] = None, - run_evaluators: Optional[List[Callable]] = None, - max_concurrency: Optional[int] = None, + data: ExperimentData, + task: TaskFunction, + evaluators: List[EvaluatorFunction] = [], + run_evaluators: List[RunEvaluatorFunction] = [], + max_concurrency: int = 50, metadata: Optional[Dict[str, Any]] = None, ) -> ExperimentResult: """Run an experiment on a dataset with automatic tracing and evaluation. 
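The hunk above narrows `run_experiment` to the new `TaskFunction`/`EvaluatorFunction` protocols, which are keyword-only. A call that satisfies them might look like the following sketch; the task and evaluator bodies are illustrative, only the keyword shapes and the `item_results` structure come from this patch.

```python
from langfuse import get_client

langfuse = get_client()


def capital_task(*, item, **kwargs):
    # TaskFunction protocol: keyword-only `item`, extra kwargs tolerated.
    return f"Capital of {item['input']}"


def exact_match(*, input, output, expected_output=None, **kwargs):
    # EvaluatorFunction protocol: returns an Evaluation-shaped dict.
    return {"name": "exact_match", "value": float(output == expected_output)}


result = langfuse.run_experiment(
    name="Capitals",
    data=[{"input": "Germany", "expected_output": "Berlin"}],
    task=capital_task,
    evaluators=[exact_match],
    max_concurrency=10,
)
print(result["item_results"][0]["evaluations"])
```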
@@ -2524,27 +2524,20 @@ async def _run_experiment_async( *, name: str, description: Optional[str], - data: Union[ - List[Union[ExperimentItem, dict, DatasetItem]], List[DatasetItemClient] - ], - task: Callable, - evaluators: List[Callable], - run_evaluators: List[Callable], - max_concurrency: Optional[int], + data: ExperimentData, + task: TaskFunction, + evaluators: List[EvaluatorFunction], + run_evaluators: List[RunEvaluatorFunction], + max_concurrency: int, metadata: Dict[str, Any], ) -> ExperimentResult: langfuse_logger.debug(f"Starting experiment '{name}' with {len(data)} items") # Set up concurrency control - max_workers = ( - max_concurrency if max_concurrency is not None else min(len(data), 10) - ) - semaphore = asyncio.Semaphore(max_workers) + semaphore = asyncio.Semaphore(max_concurrency) # Process all items - async def process_item( - item: Union[ExperimentItem, dict, DatasetItem, DatasetItemClient], - ) -> dict: + async def process_item(item: ExperimentItem) -> dict: async with semaphore: return await self._process_experiment_item( item, task, evaluators, name, description, metadata @@ -2620,7 +2613,7 @@ async def process_item( async def _process_experiment_item( self, - item: Union[ExperimentItem, dict, DatasetItem, DatasetItemClient], + item: ExperimentItem, task: Callable, evaluators: List[Callable], experiment_name: str, diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index 4589d9d25..af79520b1 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -1,10 +1,14 @@ import datetime as dt import logging -from .span import LangfuseSpan from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional from opentelemetry.util._decorator import _agnosticcontextmanager +from langfuse._client.experiments import ( + EvaluatorFunction, + RunEvaluatorFunction, + TaskFunction, +) from langfuse.model import ( CreateDatasetRunItemRequest, Dataset, @@ -12,6 +16,8 @@ DatasetStatus, ) +from .span import LangfuseSpan + if TYPE_CHECKING: from langfuse._client.client import Langfuse @@ -194,10 +200,10 @@ def run_experiment( *, name: str, description: Optional[str] = None, - task: Any, - evaluators: Optional[List[Any]] = None, - run_evaluators: Optional[List[Any]] = None, - max_concurrency: Optional[int] = None, + task: TaskFunction, + evaluators: List[EvaluatorFunction] = [], + run_evaluators: List[RunEvaluatorFunction] = [], + max_concurrency: int = 50, metadata: Optional[Dict[str, Any]] = None, ) -> Any: """Run an experiment on this dataset. diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index db27153e0..65aad2649 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -19,13 +19,11 @@ Union, ) -from langfuse.model import DatasetItem - if TYPE_CHECKING: from langfuse._client.datasets import DatasetItemClient -class ExperimentItem(TypedDict, total=False): +class LocalExperimentItem(TypedDict, total=False): """Structure for experiment data items. Args: @@ -39,6 +37,10 @@ class ExperimentItem(TypedDict, total=False): metadata: Optional[Dict[str, Any]] +ExperimentItem = Union[LocalExperimentItem, DatasetItemClient] +ExperimentData = Union[List[LocalExperimentItem], List[DatasetItemClient]] + + class Evaluation(TypedDict, total=False): """Structure for evaluation results. 
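The `LocalExperimentItem`, `ExperimentItem`, and `ExperimentData` aliases introduced in the hunk above can be used to type local datasets. A small sketch, using the same import path as the tests added later in this patch series:

```python
from langfuse._client.experiments import ExperimentData, LocalExperimentItem

# Fields mirror the LocalExperimentItem TypedDict above; all keys are optional.
item: LocalExperimentItem = {
    "input": "France",
    "expected_output": "Paris",
    "metadata": {"difficulty": "easy"},
}

data: ExperimentData = [item]
```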
@@ -66,7 +68,7 @@ class ExperimentItemResult(TypedDict): dataset_run_id: Dataset run ID if this item was part of a Langfuse dataset """ - item: Union[ExperimentItem, DatasetItem] + item: ExperimentItem output: Any evaluations: List[Evaluation] trace_id: Optional[str] @@ -93,7 +95,10 @@ class TaskFunction(Protocol): """Protocol for experiment task functions.""" def __call__( - self, item: Union[ExperimentItem, dict, DatasetItem, "DatasetItemClient"] + self, + *, + item: ExperimentItem, + **kwargs: Dict[str, Any], ) -> Union[Any, Awaitable[Any]]: """Execute the task on an experiment item. @@ -116,6 +121,7 @@ def __call__( output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None, + **kwargs: Dict[str, Any], ) -> Union[ Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] ]: @@ -137,7 +143,10 @@ class RunEvaluatorFunction(Protocol): """Protocol for run-level evaluator functions.""" def __call__( - self, *, item_results: List[ExperimentItemResult] + self, + *, + item_results: List[ExperimentItemResult], + **kwargs: Dict[str, Any], ) -> Union[ Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] ]: @@ -286,7 +295,7 @@ def _format_value(value: Any) -> str: async def _run_evaluator( - evaluator: EvaluatorFunction, **kwargs: Any + evaluator: Union[EvaluatorFunction, RunEvaluatorFunction], **kwargs: Any ) -> List[Evaluation]: """Run an evaluator function and normalize the result.""" try: @@ -299,8 +308,10 @@ async def _run_evaluator( # Normalize to list if isinstance(result, dict): return [result] + elif isinstance(result, list): return result + else: return [] @@ -310,12 +321,9 @@ async def _run_evaluator( return [] -async def _run_task( - task: TaskFunction, - item: Union[ExperimentItem, dict, DatasetItem, "DatasetItemClient"], -) -> Any: +async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any: """Run a task function and handle sync/async.""" - result = task(item) + result = task(item=item) # Handle async tasks if asyncio.iscoroutine(result): diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 86cf0845c..2a20421ba 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -1,10 +1,17 @@ """Comprehensive tests for Langfuse experiment functionality matching JS SDK.""" import time +from typing import Any, Dict, List import pytest from langfuse import get_client +from langfuse._client.experiments import ( + Evaluation, + ExperimentData, + ExperimentItem, + ExperimentItemResult, +) from tests.utils import create_uuid, get_api @@ -18,7 +25,7 @@ def sample_dataset(): ] -def mock_task(item): +def mock_task(*, item: ExperimentItem, **kwargs: Dict[str, Any]): """Mock task function that simulates processing.""" input_val = ( item.get("input") @@ -29,31 +36,37 @@ def mock_task(item): def simple_evaluator(*, input, output, expected_output=None, **kwargs): - """Simple evaluator that returns output length.""" - return {"name": "length_check", "value": len(output)} + """Return output length.""" + return Evaluation(**{"name": "length_check", "value": len(output)}) def factuality_evaluator(*, input, output, expected_output=None, **kwargs): """Mock factuality evaluator.""" # Simple mock: check if expected output is in the output if expected_output and expected_output.lower() in output.lower(): - return {"name": "factuality", "value": 1.0, "comment": "Correct answer found"} - return {"name": "factuality", "value": 0.0, "comment": "Incorrect answer"} + return Evaluation( + **{"name": 
"factuality", "value": 1.0, "comment": "Correct answer found"} + ) + return Evaluation( + **{"name": "factuality", "value": 0.0, "comment": "Incorrect answer"} + ) -def run_evaluator_average_length(*, item_results, **kwargs): +def run_evaluator_average_length(*, item_results: List[ExperimentItemResult], **kwargs): """Run evaluator that calculates average output length.""" if not item_results: - return {"name": "average_length", "value": 0} + return Evaluation(**{"name": "average_length", "value": 0}) avg_length = sum(len(r["output"]) for r in item_results) / len(item_results) - return {"name": "average_length", "value": avg_length} + + return Evaluation(**{"name": "average_length", "value": avg_length}) # Basic Functionality Tests def test_run_experiment_on_local_dataset(sample_dataset): """Test running experiment on local dataset.""" langfuse_client = get_client() + result = langfuse_client.run_experiment( name="Euro capitals", description="Country capital experiment", @@ -139,12 +152,12 @@ def failing_evaluator(**kwargs): raise Exception("Evaluator failed") def working_evaluator(**kwargs): - return {"name": "working_eval", "value": 1.0} + return Evaluation(**{"name": "working_eval", "value": 1.0}) result = langfuse_client.run_experiment( name="Error test", data=[{"input": "test"}], - task=lambda x: "result", + task=lambda **kwargs: "result", evaluators=[working_evaluator, failing_evaluator], ) @@ -200,7 +213,7 @@ def failing_run_evaluator(**kwargs): result = langfuse_client.run_experiment( name="Run evaluator error test", data=[{"input": "test"}], - task=lambda x: "result", + task=lambda **kwargs: "result", run_evaluators=[failing_run_evaluator], ) @@ -220,7 +233,7 @@ def test_empty_dataset_handling(): result = langfuse_client.run_experiment( name="Empty dataset test", data=[], - task=lambda x: "result", + task=lambda **kwargs: "result", run_evaluators=[run_evaluator_average_length], ) @@ -244,7 +257,7 @@ def test_dataset_with_missing_fields(): result = langfuse_client.run_experiment( name="Incomplete data test", data=incomplete_dataset, - task=lambda x: "result", + task=lambda **kwargs: "result", ) # Should handle missing fields gracefully @@ -261,14 +274,14 @@ def test_large_dataset_with_concurrency(): """Test handling large dataset with concurrency control.""" langfuse_client = get_client() - large_dataset = [ + large_dataset: ExperimentData = [ {"input": f"Item {i}", "expected_output": f"Output {i}"} for i in range(20) ] result = langfuse_client.run_experiment( name="Large dataset test", data=large_dataset, - task=lambda x: f"Processed {x['input']}", + task=lambda **kwargs: f"Processed {kwargs['input']}", evaluators=[lambda **kwargs: {"name": "simple_eval", "value": 1.0}], max_concurrency=5, ) @@ -288,12 +301,14 @@ def test_single_evaluation_return(): langfuse_client = get_client() def single_evaluator(**kwargs): - return {"name": "single_eval", "value": 1, "comment": "Single evaluation"} + return Evaluation( + **{"name": "single_eval", "value": 1, "comment": "Single evaluation"} + ) result = langfuse_client.run_experiment( name="Single evaluation test", data=[{"input": "test"}], - task=lambda x: "result", + task=lambda **kwargs: "result", evaluators=[single_evaluator], ) @@ -312,8 +327,7 @@ def test_no_evaluators(): result = langfuse_client.run_experiment( name="No evaluators test", data=[{"input": "test"}], - task=lambda x: "result", - evaluators=[], + task=lambda **kwargs: "result", ) assert len(result["item_results"]) == 1 @@ -329,17 +343,18 @@ def test_only_run_evaluators(): 
langfuse_client = get_client() def run_only_evaluator(**kwargs): - return { - "name": "run_only_eval", - "value": 10, - "comment": "Run-level evaluation", - } + return Evaluation( + **{ + "name": "run_only_eval", + "value": 10, + "comment": "Run-level evaluation", + } + ) result = langfuse_client.run_experiment( name="Only run evaluators test", data=[{"input": "test"}], - task=lambda x: "result", - evaluators=[], + task=lambda **kwargs: "result", run_evaluators=[run_only_evaluator], ) @@ -357,18 +372,18 @@ def test_different_data_types(): langfuse_client = get_client() def number_evaluator(**kwargs): - return {"name": "number_eval", "value": 42} + return Evaluation(**{"name": "number_eval", "value": 42}) def string_evaluator(**kwargs): - return {"name": "string_eval", "value": "excellent"} + return Evaluation(**{"name": "string_eval", "value": "excellent"}) def boolean_evaluator(**kwargs): - return {"name": "boolean_eval", "value": True} + return Evaluation(**{"name": "boolean_eval", "value": True}) result = langfuse_client.run_experiment( name="Different data types test", data=[{"input": "test"}], - task=lambda x: "result", + task=lambda **kwargs: "result", evaluators=[number_evaluator, string_evaluator, boolean_evaluator], ) @@ -402,18 +417,22 @@ def test_scores_are_persisted(): dataset = langfuse_client.get_dataset(dataset_name) def test_evaluator(**kwargs): - return { - "name": "persistence_test", - "value": 0.85, - "comment": "Test evaluation for persistence", - } + return Evaluation( + **{ + "name": "persistence_test", + "value": 0.85, + "comment": "Test evaluation for persistence", + } + ) def test_run_evaluator(**kwargs): - return { - "name": "persistence_run_test", - "value": 0.9, - "comment": "Test run evaluation for persistence", - } + return Evaluation( + **{ + "name": "persistence_run_test", + "value": 0.9, + "comment": "Test run evaluation for persistence", + } + ) result = dataset.run_experiment( name="Score persistence test", @@ -506,7 +525,7 @@ def test_format_experiment_results_basic(): name="Formatting test", description="Test result formatting", data=[{"input": "Hello", "expected_output": "Hi"}], - task=lambda x: f"Processed: {x['input']}", + task=lambda **kwargs: f"Processed: {kwargs['input']}", evaluators=[simple_evaluator], run_evaluators=[run_evaluator_average_length], ) From f5f2cacc3783303f66559fb0a7cddd175e53f875 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:26:00 +0200 Subject: [PATCH 05/25] push --- langfuse/_client/client.py | 16 +++++++--------- langfuse/_client/experiments.py | 4 ++-- tests/test_experiments.py | 4 ++-- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index b39ce8a0d..cccebb1b9 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2670,15 +2670,13 @@ async def _process_experiment_item( try: from langfuse.model import CreateDatasetRunItemRequest - dataset_run_item = ( - await self.async_api.dataset_run_items.create( - request=CreateDatasetRunItemRequest( - runName=experiment_name, - runDescription=experiment_description, - metadata=experiment_metadata, - datasetItemId=item.id, # type: ignore - traceId=trace_id, - ) + dataset_run_item = self.api.dataset_run_items.create( + request=CreateDatasetRunItemRequest( + runName=experiment_name, + runDescription=experiment_description, + metadata=experiment_metadata, + datasetItemId=item.id, # type: ignore + traceId=trace_id, ) ) diff --git 
a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index 65aad2649..0a80e25ac 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -37,8 +37,8 @@ class LocalExperimentItem(TypedDict, total=False): metadata: Optional[Dict[str, Any]] -ExperimentItem = Union[LocalExperimentItem, DatasetItemClient] -ExperimentData = Union[List[LocalExperimentItem], List[DatasetItemClient]] +ExperimentItem = Union[LocalExperimentItem, "DatasetItemClient"] +ExperimentData = Union[List[LocalExperimentItem], List["DatasetItemClient"]] class Evaluation(TypedDict, total=False): diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 2a20421ba..f9be524e3 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -281,7 +281,7 @@ def test_large_dataset_with_concurrency(): result = langfuse_client.run_experiment( name="Large dataset test", data=large_dataset, - task=lambda **kwargs: f"Processed {kwargs['input']}", + task=lambda **kwargs: f"Processed {kwargs['item']}", evaluators=[lambda **kwargs: {"name": "simple_eval", "value": 1.0}], max_concurrency=5, ) @@ -525,7 +525,7 @@ def test_format_experiment_results_basic(): name="Formatting test", description="Test result formatting", data=[{"input": "Hello", "expected_output": "Hi"}], - task=lambda **kwargs: f"Processed: {kwargs['input']}", + task=lambda **kwargs: f"Processed: {kwargs['item']}", evaluators=[simple_evaluator], run_evaluators=[run_evaluator_average_length], ) From ce290f5d8705c6e4952226552cbaf3c4b8df53e5 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:38:50 +0200 Subject: [PATCH 06/25] expand tests --- tests/test_experiments.py | 150 +++++++++++++++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/tests/test_experiments.py b/tests/test_experiments.py index f9be524e3..c278243ab 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -96,6 +96,48 @@ def test_run_experiment_on_local_dataset(sample_dataset): langfuse_client.flush() time.sleep(2) + # Validate traces are correctly persisted with input/output/metadata + api = get_api() + expected_inputs = ["Germany", "France", "Spain"] + expected_outputs = ["Capital of Germany", "Capital of France", "Capital of Spain"] + + for i, item_result in enumerate(result["item_results"]): + trace_id = item_result["trace_id"] + assert trace_id is not None, f"Item {i} should have a trace_id" + + # Fetch trace from API + trace = api.trace.get(trace_id) + assert trace is not None, f"Trace {trace_id} should exist" + + # Validate trace name + assert ( + trace.name == "experiment-item-run" + ), f"Trace {trace_id} should have correct name" + + # Validate trace input - should contain the experiment item + assert trace.input is not None, f"Trace {trace_id} should have input" + expected_input = expected_inputs[i] + # The input should contain the item data in some form + assert expected_input in str( + trace.input + ), f"Trace {trace_id} input should contain '{expected_input}'" + + # Validate trace output - should be the task result + assert trace.output is not None, f"Trace {trace_id} should have output" + expected_output = expected_outputs[i] + assert ( + trace.output == expected_output + ), f"Trace {trace_id} output should be '{expected_output}', got '{trace.output}'" + + # Validate trace metadata contains experiment name + assert trace.metadata is not None, f"Trace {trace_id} should have metadata" + assert ( + "experiment_name" in 
trace.metadata + ), f"Trace {trace_id} metadata should contain experiment_name" + assert ( + trace.metadata["experiment_name"] == "Euro capitals" + ), f"Trace {trace_id} metadata should have correct experiment_name" + def test_run_experiment_on_langfuse_dataset(): """Test running experiment on Langfuse dataset.""" @@ -120,8 +162,10 @@ def test_run_experiment_on_langfuse_dataset(): # Get dataset and run experiment dataset = langfuse_client.get_dataset(dataset_name) + # Use unique experiment name for proper identification + experiment_name = "Dataset Test " + create_uuid()[:8] result = dataset.run_experiment( - name="Dataset Test", + name=experiment_name, description="Test on Langfuse dataset", task=mock_task, evaluators=[factuality_evaluator], @@ -142,6 +186,110 @@ def test_run_experiment_on_langfuse_dataset(): runs = api.datasets.get_runs(dataset_name) assert len(runs.data) >= 1 + # Validate traces are correctly persisted with input/output/metadata + expected_data = {"Germany": "Capital of Germany", "France": "Capital of France"} + dataset_run_id = result["dataset_run_id"] + + # Create a mapping from dataset item ID to dataset item for validation + dataset_item_map = {item.id: item for item in dataset.items} + + for i, item_result in enumerate(result["item_results"]): + trace_id = item_result["trace_id"] + assert trace_id is not None, f"Item {i} should have a trace_id" + + # Fetch trace from API + trace = api.trace.get(trace_id) + assert trace is not None, f"Trace {trace_id} should exist" + + # Validate trace name + assert ( + trace.name == "experiment-item-run" + ), f"Trace {trace_id} should have correct name" + + # Validate trace input and output match expected pairs + assert trace.input is not None, f"Trace {trace_id} should have input" + trace_input_str = str(trace.input) + + # Find which expected input this trace corresponds to + matching_input = None + for expected_input in expected_data.keys(): + if expected_input in trace_input_str: + matching_input = expected_input + break + + assert ( + matching_input is not None + ), f"Trace {trace_id} input '{trace_input_str}' should contain one of {list(expected_data.keys())}" + + # Validate trace output matches the expected output for this input + assert trace.output is not None, f"Trace {trace_id} should have output" + expected_output = expected_data[matching_input] + assert ( + trace.output == expected_output + ), f"Trace {trace_id} output should be '{expected_output}', got '{trace.output}'" + + # Validate trace metadata contains experiment and dataset info + assert trace.metadata is not None, f"Trace {trace_id} should have metadata" + assert ( + "experiment_name" in trace.metadata + ), f"Trace {trace_id} metadata should contain experiment_name" + assert ( + trace.metadata["experiment_name"] == experiment_name + ), f"Trace {trace_id} metadata should have correct experiment_name" + + # Validate dataset-specific metadata fields + assert ( + "dataset_id" in trace.metadata + ), f"Trace {trace_id} metadata should contain dataset_id" + assert ( + trace.metadata["dataset_id"] == dataset.id + ), f"Trace {trace_id} metadata should have correct dataset_id" + + assert ( + "dataset_item_id" in trace.metadata + ), f"Trace {trace_id} metadata should contain dataset_item_id" + # Get the dataset item ID from metadata and validate it exists + dataset_item_id = trace.metadata["dataset_item_id"] + assert ( + dataset_item_id in dataset_item_map + ), f"Trace {trace_id} metadata dataset_item_id should correspond to a valid dataset item" + + # Validate the 
dataset item input matches the trace input + dataset_item = dataset_item_map[dataset_item_id] + assert ( + dataset_item.input == matching_input + ), f"Trace {trace_id} should correspond to dataset item with input '{matching_input}'" + + # Verify dataset run contains the correct trace IDs + dataset_run = None + for run in runs.data: + if run.id == dataset_run_id: + dataset_run = run + break + + assert dataset_run is not None, f"Dataset run {dataset_run_id} should exist" + assert dataset_run.name == experiment_name, "Dataset run should have correct name" + assert ( + dataset_run.description == "Test on Langfuse dataset" + ), "Dataset run should have correct description" + + # Get dataset run items to verify trace linkage + dataset_run_items = api.dataset_run_items.list( + dataset_id=dataset.id, run_name=experiment_name + ) + assert len(dataset_run_items.data) == 2, "Dataset run should have 2 items" + + # Verify each dataset run item links to the correct trace + run_item_trace_ids = { + item.trace_id for item in dataset_run_items.data if item.trace_id + } + result_trace_ids = {item["trace_id"] for item in result["item_results"]} + + assert run_item_trace_ids == result_trace_ids, ( + f"Dataset run items should link to the same traces as experiment results. " + f"Run items: {run_item_trace_ids}, Results: {result_trace_ids}" + ) + # Error Handling Tests def test_evaluator_failures_handled_gracefully(): From 477a1c9abae9262f2061238bd139d628e15e151d Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:58:43 +0200 Subject: [PATCH 07/25] expand docstrings --- langfuse/_client/client.py | 158 +++++++++-- langfuse/_client/datasets.py | 183 ++++++++++-- langfuse/_client/experiments.py | 473 ++++++++++++++++++++++++++++++-- langfuse/types.py | 55 +++- 4 files changed, 796 insertions(+), 73 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index cccebb1b9..678478d7d 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2472,39 +2472,151 @@ def run_experiment( """Run an experiment on a dataset with automatic tracing and evaluation. This method executes a task function on each item in the provided dataset, - traces the execution with Langfuse, runs evaluators on the outputs, - and returns formatted results. + automatically traces all executions with Langfuse for observability, runs + item-level and run-level evaluators on the outputs, and returns comprehensive + results with evaluation metrics. + + The experiment system provides: + - Automatic tracing of all task executions + - Concurrent processing with configurable limits + - Comprehensive error handling that isolates failures + - Integration with Langfuse datasets for experiment tracking + - Flexible evaluation framework supporting both sync and async evaluators Args: - name: Human-readable name for the experiment - description: Optional description of the experiment's purpose - data: Array of data items to process (ExperimentItem or DatasetItem) - task: Function that processes each data item and returns output - evaluators: Optional list of functions to evaluate each item's output - run_evaluators: Optional list of functions to evaluate the entire experiment - max_concurrency: Maximum number of concurrent task executions - metadata: Optional metadata to attach to the experiment + name: Human-readable name for the experiment. Used for identification + in the Langfuse UI and for dataset run naming if using Langfuse datasets. 
+ description: Optional description explaining the experiment's purpose, + methodology, or expected outcomes. + data: Array of data items to process. Can be either: + - List of dict-like items with 'input', 'expected_output', 'metadata' keys + - List of Langfuse DatasetItem objects from dataset.items + task: Function that processes each data item and returns output. + Must accept 'item' as keyword argument and can return sync or async results. + The task function signature should be: task(*, item, **kwargs) -> Any + evaluators: List of functions to evaluate each item's output individually. + Each evaluator receives input, output, expected_output, and metadata. + Can return single Evaluation dict or list of Evaluation dicts. + run_evaluators: List of functions to evaluate the entire experiment run. + Each run evaluator receives all item_results and can compute aggregate metrics. + Useful for calculating averages, distributions, or cross-item comparisons. + max_concurrency: Maximum number of concurrent task executions (default: 50). + Controls the number of items processed simultaneously. Adjust based on + API rate limits and system resources. + metadata: Optional metadata dictionary to attach to all experiment traces. + This metadata will be included in every trace created during the experiment. Returns: - ExperimentResult containing item results, evaluations, and formatting functions + ExperimentResult dictionary containing: + - item_results: List of results for each processed item with outputs and evaluations + - run_evaluations: List of aggregate evaluation results for the entire run + - dataset_run_id: ID of the dataset run (if using Langfuse datasets) + - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) - Example: + Raises: + ValueError: If required parameters are missing or invalid + Exception: If experiment setup fails (individual item failures are handled gracefully) + + Examples: + Basic experiment with local data: ```python - def task(item): - return f"Processed: {item['input']}" + def summarize_text(*, item, **kwargs): + return f"Summary: {item['input'][:50]}..." 
- def evaluator(*, input, output, expected_output=None, **kwargs): - return {"name": "length", "value": len(output)} + def length_evaluator(*, input, output, expected_output=None, **kwargs): + return { + "name": "output_length", + "value": len(output), + "comment": f"Output contains {len(output)} characters" + } result = langfuse.run_experiment( - name="Test Experiment", - data=[{"input": "test", "expected_output": "expected"}], - task=task, - evaluators=[evaluator] + name="Text Summarization Test", + description="Evaluate summarization quality and length", + data=[ + {"input": "Long article text...", "expected_output": "Expected summary"}, + {"input": "Another article...", "expected_output": "Another summary"} + ], + task=summarize_text, + evaluators=[length_evaluator] ) - print(result["item_results"]) + print(f"Processed {len(result['item_results'])} items") + for item_result in result["item_results"]: + print(f"Input: {item_result['item']['input']}") + print(f"Output: {item_result['output']}") + print(f"Evaluations: {item_result['evaluations']}") ``` + + Advanced experiment with async task and multiple evaluators: + ```python + async def llm_task(*, item, **kwargs): + # Simulate async LLM call + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": item["input"]}] + ) + return response.choices[0].message.content + + def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): + if expected_output and expected_output.lower() in output.lower(): + return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} + return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} + + def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): + # Simulate toxicity check + toxicity_score = check_toxicity(output) # Your toxicity checker + return { + "name": "toxicity", + "value": toxicity_score, + "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" + } + + def average_accuracy(*, item_results, **kwargs): + accuracies = [ + eval["value"] for result in item_results + for eval in result["evaluations"] + if eval["name"] == "accuracy" + ] + return { + "name": "average_accuracy", + "value": sum(accuracies) / len(accuracies) if accuracies else 0, + "comment": f"Average accuracy across {len(accuracies)} items" + } + + result = langfuse.run_experiment( + name="LLM Safety and Accuracy Test", + description="Evaluate model accuracy and safety across diverse prompts", + data=test_dataset, # Your dataset items + task=llm_task, + evaluators=[accuracy_evaluator, toxicity_evaluator], + run_evaluators=[average_accuracy], + max_concurrency=5, # Limit concurrent API calls + metadata={"model": "gpt-4", "temperature": 0.7} + ) + ``` + + Using with Langfuse datasets: + ```python + # Get dataset from Langfuse + dataset = langfuse.get_dataset("my-eval-dataset") + + result = dataset.run_experiment( + name="Production Model Evaluation", + description="Monthly evaluation of production model performance", + task=my_production_task, + evaluators=[accuracy_evaluator, latency_evaluator] + ) + + # Results automatically linked to dataset in Langfuse UI + print(f"View results: {result['dataset_run_url']}") + ``` + + Note: + - Task and evaluator functions can be either synchronous or asynchronous + - Individual item failures are logged but don't stop the experiment + - All executions are automatically traced and visible in Langfuse UI + - When using Langfuse datasets, results are automatically linked for easy 
comparison """ return asyncio.run( self._run_experiment_async( @@ -2596,7 +2708,7 @@ async def process_item(item: ExperimentItem) -> dict: self.create_score( dataset_run_id=dataset_run_id, name=evaluation["name"], - value=evaluation["value"], + value=evaluation["value"], # type: ignore comment=evaluation.get("comment"), metadata=evaluation.get("metadata"), ) @@ -2718,7 +2830,7 @@ async def _process_experiment_item( self.create_score( trace_id=trace_id, name=evaluation.get("name", "unknown"), - value=evaluation.get("value", -1), + value=evaluation.get("value", -1), # type: ignore comment=evaluation.get("comment"), metadata=evaluation.get("metadata"), ) diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index af79520b1..cab0d98b6 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -206,41 +206,182 @@ def run_experiment( max_concurrency: int = 50, metadata: Optional[Dict[str, Any]] = None, ) -> Any: - """Run an experiment on this dataset. + """Run an experiment on this Langfuse dataset with automatic tracking. - This is a convenience method that calls the Langfuse client's run_experiment - method with this dataset's items as the data. + This is a convenience method that runs an experiment using all items in this + dataset. It automatically creates a dataset run in Langfuse for tracking and + comparison purposes, linking all experiment results to the dataset. + + Key benefits of using dataset.run_experiment(): + - Automatic dataset run creation and linking in Langfuse UI + - Built-in experiment tracking and versioning + - Easy comparison between different experiment runs + - Direct access to dataset items with their metadata and expected outputs + - Automatic URL generation for viewing results in Langfuse dashboard Args: - name: Human-readable name for the experiment - description: Optional description of the experiment's purpose - task: Function that processes each data item and returns output - evaluators: Optional list of functions to evaluate each item's output - run_evaluators: Optional list of functions to evaluate the entire experiment - max_concurrency: Maximum number of concurrent task executions - metadata: Optional metadata to attach to the experiment + name: Human-readable name for the experiment run. This will be used as + the dataset run name in Langfuse for tracking and identification. + description: Optional description of the experiment's purpose, methodology, + or what you're testing. Appears in the Langfuse UI for context. + task: Function that processes each dataset item and returns output. + The function will receive DatasetItem objects with .input, .expected_output, + .metadata attributes. Signature should be: task(*, item, **kwargs) -> Any + evaluators: List of functions to evaluate each item's output individually. + These will have access to the item's expected_output for comparison. + run_evaluators: List of functions to evaluate the entire experiment run. + Useful for computing aggregate statistics across all dataset items. + max_concurrency: Maximum number of concurrent task executions (default: 50). + Adjust based on API rate limits and system resources. + metadata: Optional metadata to attach to the experiment run and all traces. + Will be combined with individual item metadata. 
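A quick sketch of the metadata option just described, assuming an existing dataset named `qa-evaluation-set` and Langfuse credentials configured via the environment; how run-level metadata is merged with per-item metadata on each trace is left to the implementation above:

```python
from langfuse import get_client

langfuse = get_client()
dataset = langfuse.get_dataset("qa-evaluation-set")  # hypothetical dataset name

result = dataset.run_experiment(
    name="QA regression - nightly",
    description="Nightly run with run-level metadata",
    task=lambda *, item, **kwargs: str(item.input).upper(),  # trivial placeholder task
    metadata={"model": "qa-v2", "git_sha": "abc123"},  # attached to every trace in this run
)

print(result["dataset_run_url"])  # direct link to the run in the Langfuse UI
```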
Returns: - ExperimentResult containing item results, evaluations, and formatting functions + ExperimentResult dictionary containing: + - item_results: Results for each dataset item with outputs and evaluations + - run_evaluations: Aggregate evaluation results for the entire run + - dataset_run_id: ID of the created dataset run in Langfuse + - dataset_run_url: Direct URL to view the experiment results in Langfuse UI + + Raises: + ValueError: If the dataset has no items or no Langfuse client is available - Example: + Examples: + Basic dataset experiment: ```python - dataset = langfuse.get_dataset("my-dataset") + dataset = langfuse.get_dataset("qa-evaluation-set") + + def answer_questions(*, item, **kwargs): + # item is a DatasetItem with .input, .expected_output, .metadata + question = item.input + return my_qa_system.answer(question) + + def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): + if not expected_output: + return {"name": "accuracy", "value": None, "comment": "No expected output"} + + is_correct = output.strip().lower() == expected_output.strip().lower() + return { + "name": "accuracy", + "value": 1.0 if is_correct else 0.0, + "comment": "Correct" if is_correct else "Incorrect" + } - def task(item): - return f"Processed: {item.input}" + result = dataset.run_experiment( + name="QA System v2.0 Evaluation", + description="Testing improved QA system on curated question set", + task=answer_questions, + evaluators=[accuracy_evaluator] + ) - def evaluator(*, input, output, expected_output=None, **kwargs): - return {"name": "length", "value": len(output)} + print(f"Evaluated {len(result['item_results'])} questions") + print(f"View detailed results: {result['dataset_run_url']}") + ``` + + Advanced experiment with multiple evaluators and run-level analysis: + ```python + dataset = langfuse.get_dataset("content-generation-benchmark") + + async def generate_content(*, item, **kwargs): + prompt = item.input + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": prompt}], + temperature=0.7 + ) + return response.choices[0].message.content + + def quality_evaluator(*, input, output, expected_output=None, metadata=None, **kwargs): + # Use metadata for context-aware evaluation + content_type = metadata.get("type", "general") if metadata else "general" + + # Basic quality checks + word_count = len(output.split()) + min_words = {"blog": 300, "tweet": 10, "summary": 100}.get(content_type, 50) + + return [ + { + "name": "word_count", + "value": word_count, + "comment": f"Generated {word_count} words" + }, + { + "name": "meets_length_requirement", + "value": word_count >= min_words, + "comment": f"{'Meets' if word_count >= min_words else 'Below'} minimum {min_words} words for {content_type}" + } + ] + + def content_diversity(*, item_results, **kwargs): + # Analyze diversity across all generated content + all_outputs = [result["output"] for result in item_results] + unique_words = set() + total_words = 0 + + for output in all_outputs: + words = output.lower().split() + unique_words.update(words) + total_words += len(words) + + diversity_ratio = len(unique_words) / total_words if total_words > 0 else 0 + + return { + "name": "vocabulary_diversity", + "value": diversity_ratio, + "comment": f"Used {len(unique_words)} unique words out of {total_words} total ({diversity_ratio:.2%} diversity)" + } result = dataset.run_experiment( - name="Dataset Test Experiment", - task=task, - evaluators=[evaluator] + name="Content Generation 
Diversity Test", + description="Evaluating content quality and vocabulary diversity across different content types", + task=generate_content, + evaluators=[quality_evaluator], + run_evaluators=[content_diversity], + max_concurrency=3, # Limit API calls + metadata={"model": "gpt-4", "temperature": 0.7} + ) + + # Results are automatically linked to dataset in Langfuse + print(f"Experiment completed! View in Langfuse: {result['dataset_run_url']}") + + # Access individual results + for i, item_result in enumerate(result["item_results"]): + print(f"Item {i+1}: {item_result['evaluations']}") + ``` + + Comparing different model versions: + ```python + # Run multiple experiments on the same dataset for comparison + dataset = langfuse.get_dataset("model-benchmark") + + # Experiment 1: GPT-4 + result_gpt4 = dataset.run_experiment( + name="GPT-4 Baseline", + description="Baseline performance with GPT-4", + task=lambda *, item, **kwargs: gpt4_model.generate(item.input), + evaluators=[accuracy_evaluator, fluency_evaluator] ) - print(result["item_results"]) + # Experiment 2: Custom model + result_custom = dataset.run_experiment( + name="Custom Model v1.2", + description="Testing our fine-tuned model", + task=lambda *, item, **kwargs: custom_model.generate(item.input), + evaluators=[accuracy_evaluator, fluency_evaluator] + ) + + # Both experiments are now visible in Langfuse for easy comparison + print("Compare results in Langfuse:") + print(f"GPT-4: {result_gpt4['dataset_run_url']}") + print(f"Custom: {result_custom['dataset_run_url']}") ``` + + Note: + - All experiment results are automatically tracked in Langfuse as dataset runs + - Dataset items provide .input, .expected_output, and .metadata attributes + - Results can be easily compared across different experiment runs in the UI + - The dataset_run_url provides direct access to detailed results and analysis + - Failed items are handled gracefully and logged without stopping the experiment """ langfuse_client = self._get_langfuse_client() if not langfuse_client: diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index 0a80e25ac..8628da489 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -24,12 +24,49 @@ class LocalExperimentItem(TypedDict, total=False): - """Structure for experiment data items. - - Args: - input: The input data to pass to the task function - expected_output: Optional expected output for evaluation purposes - metadata: Optional metadata for the experiment item + """Structure for local experiment data items (not from Langfuse datasets). + + This TypedDict defines the structure for experiment items when using local data + rather than Langfuse-hosted datasets. All fields are optional to provide + flexibility in data structure. + + Attributes: + input: The input data to pass to the task function. Can be any type that + your task function can process (string, dict, list, etc.). This is + typically the prompt, question, or data that your task will operate on. + expected_output: Optional expected/ground truth output for evaluation purposes. + Used by evaluators to assess correctness or quality. Can be None if + no ground truth is available. + metadata: Optional metadata dictionary containing additional context about + this specific item. Can include information like difficulty level, + category, source, or any other relevant attributes that evaluators + might use for context-aware evaluation. 
+ + Examples: + Simple text processing item: + ```python + item: LocalExperimentItem = { + "input": "Summarize this article: ...", + "expected_output": "Expected summary...", + "metadata": {"difficulty": "medium", "category": "news"} + } + ``` + + Classification item: + ```python + item: LocalExperimentItem = { + "input": {"text": "This movie is great!", "context": "movie review"}, + "expected_output": "positive", + "metadata": {"dataset_source": "imdb", "confidence": 0.95} + } + ``` + + Minimal item with only input: + ```python + item: LocalExperimentItem = { + "input": "What is the capital of France?" + } + ``` """ input: Any @@ -38,21 +75,88 @@ class LocalExperimentItem(TypedDict, total=False): ExperimentItem = Union[LocalExperimentItem, "DatasetItemClient"] +"""Type alias for items that can be processed in experiments. + +Can be either: +- LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys +- DatasetItemClient: Items from Langfuse datasets with .input, .expected_output, .metadata attributes +""" + ExperimentData = Union[List[LocalExperimentItem], List["DatasetItemClient"]] +"""Type alias for experiment datasets. +Represents the collection of items to process in an experiment. Can be either: +- List[LocalExperimentItem]: Local data items as dictionaries +- List[DatasetItemClient]: Items from a Langfuse dataset (typically from dataset.items) +""" -class Evaluation(TypedDict, total=False): - """Structure for evaluation results. - Args: - name: Name of the evaluation metric - value: The evaluation score/value (numeric or string) - comment: Optional comment explaining the evaluation - metadata: Optional metadata for the evaluation +class Evaluation(TypedDict, total=False): + """Structure for evaluation results returned by evaluator functions. + + This TypedDict defines the standardized format that all evaluator functions + must return. It provides a consistent structure for storing evaluation metrics + and their metadata across different types of evaluators. + + Attributes: + name: Unique identifier for the evaluation metric. Should be descriptive + and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). + Used for aggregation and comparison across experiment runs. + value: The evaluation score or result. Can be: + - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42) + - String: For categorical results like "positive", "negative", "neutral" + - Boolean: For binary assessments like "passes_safety_check" + - None: When evaluation cannot be computed (missing data, API errors, etc.) + comment: Optional human-readable explanation of the evaluation result. + Useful for providing context, explaining scoring rationale, or noting + special conditions. Displayed in Langfuse UI for interpretability. + metadata: Optional structured metadata about the evaluation process. + Can include confidence scores, intermediate calculations, model versions, + or any other relevant technical details. 
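Because `Evaluation` is declared with `total=False`, every key is optional at construction time, so evaluators can return only what they actually have; a minimal sketch of equivalent ways to build one (values are illustrative):

```python
from langfuse.types import Evaluation  # re-exported via langfuse.types later in this patch series

# Keyword construction with only the keys you need
concise: Evaluation = Evaluation(name="accuracy", value=0.9)

# A plain dict literal satisfies the same TypedDict
verbose: Evaluation = {
    "name": "accuracy",
    "value": 0.9,
    "comment": "9 of 10 answers matched",
    "metadata": {"correct": 9, "total": 10},
}
```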
+ + Examples: + Quantitative accuracy evaluation: + ```python + accuracy_result: Evaluation = { + "name": "accuracy", + "value": 0.85, + "comment": "85% of responses were correct", + "metadata": {"total_items": 100, "correct_items": 85} + } + ``` + + Qualitative assessment: + ```python + sentiment_result: Evaluation = { + "name": "sentiment", + "value": "positive", + "comment": "Response expresses optimistic viewpoint", + "metadata": {"confidence": 0.92, "model": "sentiment-analyzer-v2"} + } + ``` + + Binary check: + ```python + safety_result: Evaluation = { + "name": "safety_check", + "value": True, + "comment": "Content passes all safety filters" + } + ``` + + Failed evaluation: + ```python + failed_result: Evaluation = { + "name": "external_api_score", + "value": None, + "comment": "External API unavailable", + "metadata": {"error": "timeout", "retry_count": 3} + } + ``` """ name: str - value: Union[int, float, str, bool] + value: Union[int, float, str, bool, None] comment: Optional[str] metadata: Optional[Dict[str, Any]] @@ -92,7 +196,18 @@ class ExperimentResult(TypedDict): class TaskFunction(Protocol): - """Protocol for experiment task functions.""" + """Protocol defining the interface for experiment task functions. + + Task functions are the core processing functions that operate on each item + in an experiment dataset. They receive an experiment item as input and + produce some output that will be evaluated. + + Task functions must: + - Accept 'item' as a keyword argument + - Return any type of output (will be passed to evaluators) + - Can be either synchronous or asynchronous + - Should handle their own errors gracefully (exceptions will be logged) + """ def __call__( self, @@ -102,17 +217,72 @@ def __call__( ) -> Union[Any, Awaitable[Any]]: """Execute the task on an experiment item. + This method defines the core processing logic for each item in your experiment. + The implementation should focus on the specific task you want to evaluate, + such as text generation, classification, summarization, etc. + Args: - item: The experiment or dataset item to process + item: The experiment item to process. Can be either: + - Dict with keys like 'input', 'expected_output', 'metadata' + - Langfuse DatasetItem object with .input, .expected_output attributes + **kwargs: Additional keyword arguments that may be passed by the framework Returns: - The task output (can be sync or async) + Any: The output of processing the item. This output will be: + - Stored in the experiment results + - Passed to all item-level evaluators for assessment + - Traced automatically in Langfuse for observability + + Can return either a direct value or an awaitable (async) result. 
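Since `TaskFunction` is a `typing.Protocol`, any callable with a matching keyword-only signature satisfies it structurally, which is useful for static type checking; a small sketch (the task body is a placeholder):

```python
from langfuse.types import ExperimentItem, TaskFunction

def echo_task(*, item: ExperimentItem, **kwargs) -> str:
    # Handles both local dict items and DatasetItemClient objects
    value = item["input"] if isinstance(item, dict) else item.input
    return f"echo: {value}"

# Assignable without inheriting from the protocol - structural typing does the work
task: TaskFunction = echo_task
```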
+ + Examples: + Simple synchronous task: + ```python + def my_task(*, item, **kwargs): + prompt = f"Summarize: {item['input']}" + return my_llm_client.generate(prompt) + ``` + + Async task with error handling: + ```python + async def my_async_task(*, item, **kwargs): + try: + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": item["input"]}] + ) + return response.choices[0].message.content + except Exception as e: + # Log error and return fallback + print(f"Task failed for item {item}: {e}") + return "Error: Could not process item" + ``` + + Task using dataset item attributes: + ```python + def classification_task(*, item, **kwargs): + # Works with both dict items and DatasetItem objects + text = item["input"] if isinstance(item, dict) else item.input + return classify_text(text) + ``` """ ... class EvaluatorFunction(Protocol): - """Protocol for item-level evaluator functions.""" + """Protocol defining the interface for item-level evaluator functions. + + Item-level evaluators assess the quality, correctness, or other properties + of individual task outputs. They receive the input, output, expected output, + and metadata for each item and return evaluation metrics. + + Evaluators should: + - Accept input, output, expected_output, and metadata as keyword arguments + - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields + - Be deterministic when possible for reproducible results + - Handle edge cases gracefully (missing expected output, malformed data, etc.) + - Can be either synchronous or asynchronous + """ def __call__( self, @@ -125,22 +295,134 @@ def __call__( ) -> Union[ Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] ]: - """Evaluate a task output. + """Evaluate a task output for quality, correctness, or other metrics. + + This method should implement specific evaluation logic such as accuracy checking, + similarity measurement, toxicity detection, fluency assessment, etc. Args: - input: The original input to the task - output: The output produced by the task - expected_output: The expected output (if available) - metadata: Optional metadata from the experiment item + input: The original input that was passed to the task function. + This is typically the item['input'] or item.input value. + output: The output produced by the task function for this input. + This is the direct return value from your task function. + expected_output: The expected/ground truth output for comparison. + May be None if not available in the dataset. Evaluators should + handle this case appropriately. + metadata: Optional metadata from the experiment item that might + contain additional context for evaluation (categories, difficulty, etc.) 
+ **kwargs: Additional keyword arguments that may be passed by the framework Returns: - Single evaluation or list of evaluations (can be sync or async) + Evaluation results in one of these formats: + - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."} + - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}] + - Awaitable returning either of the above (for async evaluators) + + Each Evaluation dict should contain: + - name (str): Unique identifier for this evaluation metric + - value (int|float|str|bool): The evaluation score or result + - comment (str, optional): Human-readable explanation of the result + - metadata (dict, optional): Additional structured data about the evaluation + + Examples: + Simple accuracy evaluator: + ```python + def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): + if expected_output is None: + return {"name": "accuracy", "value": None, "comment": "No expected output"} + + is_correct = output.strip().lower() == expected_output.strip().lower() + return { + "name": "accuracy", + "value": 1.0 if is_correct else 0.0, + "comment": "Exact match" if is_correct else "No match" + } + ``` + + Multi-metric evaluator: + ```python + def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): + results = [] + + # Length check + results.append({ + "name": "output_length", + "value": len(output), + "comment": f"Output contains {len(output)} characters" + }) + + # Sentiment analysis + sentiment_score = analyze_sentiment(output) + results.append({ + "name": "sentiment", + "value": sentiment_score, + "comment": f"Sentiment score: {sentiment_score:.2f}" + }) + + return results + ``` + + Async evaluator using external API: + ```python + async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs): + prompt = f"Rate the quality of this response on a scale of 1-10:\n" + prompt += f"Question: {input}\nResponse: {output}" + + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": prompt}] + ) + + try: + score = float(response.choices[0].message.content.strip()) + return { + "name": "llm_judge_quality", + "value": score, + "comment": f"LLM judge rated this {score}/10" + } + except ValueError: + return { + "name": "llm_judge_quality", + "value": None, + "comment": "Could not parse LLM judge score" + } + ``` + + Context-aware evaluator: + ```python + def context_evaluator(*, input, output, metadata=None, **kwargs): + # Use metadata for context-specific evaluation + difficulty = metadata.get("difficulty", "medium") if metadata else "medium" + + # Adjust expectations based on difficulty + min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty] + + meets_requirement = len(output) >= min_length + return { + "name": f"meets_{difficulty}_requirement", + "value": meets_requirement, + "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement" + } + ``` """ ... class RunEvaluatorFunction(Protocol): - """Protocol for run-level evaluator functions.""" + """Protocol defining the interface for run-level evaluator functions. + + Run-level evaluators assess aggregate properties of the entire experiment run, + computing metrics that span across all items rather than individual outputs. + They receive the complete results from all processed items and can compute + statistics like averages, distributions, correlations, or other aggregate metrics. 
+ + Run evaluators should: + - Accept item_results as a keyword argument containing all item results + - Return Evaluation dict(s) with aggregate metrics + - Handle cases where some items may have failed processing + - Compute meaningful statistics across the dataset + - Can be either synchronous or asynchronous + """ def __call__( self, @@ -150,13 +432,148 @@ def __call__( ) -> Union[ Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] ]: - """Evaluate the entire experiment run. + """Evaluate the entire experiment run with aggregate metrics. + + This method should implement aggregate evaluation logic such as computing + averages, calculating distributions, finding correlations, detecting patterns + across items, or performing statistical analysis on the experiment results. Args: - item_results: Results from all processed experiment items + item_results: List of results from all successfully processed experiment items. + Each item result contains: + - item: The original experiment item + - output: The task function's output for this item + - evaluations: List of item-level evaluation results + - trace_id: Langfuse trace ID for this execution + - dataset_run_id: Dataset run ID (if using Langfuse datasets) + + Note: This list only includes items that were successfully processed. + Failed items are excluded but logged separately. + **kwargs: Additional keyword arguments that may be passed by the framework Returns: - Single evaluation or list of evaluations (can be sync or async) + Evaluation results in one of these formats: + - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."} + - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}] + - Awaitable returning either of the above (for async evaluators) + + Each Evaluation dict should contain: + - name (str): Unique identifier for this run-level metric + - value (int|float|str|bool): The aggregate evaluation result + - comment (str, optional): Human-readable explanation of the metric + - metadata (dict, optional): Additional structured data about the evaluation + + Examples: + Average accuracy calculator: + ```python + def average_accuracy(*, item_results, **kwargs): + if not item_results: + return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"} + + accuracy_values = [] + for result in item_results: + for evaluation in result["evaluations"]: + if evaluation["name"] == "accuracy": + accuracy_values.append(evaluation["value"]) + + if not accuracy_values: + return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"} + + avg = sum(accuracy_values) / len(accuracy_values) + return { + "name": "avg_accuracy", + "value": avg, + "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}" + } + ``` + + Multiple aggregate metrics: + ```python + def statistical_summary(*, item_results, **kwargs): + if not item_results: + return [] + + results = [] + + # Calculate output length statistics + lengths = [len(str(result["output"])) for result in item_results] + results.extend([ + {"name": "avg_output_length", "value": sum(lengths) / len(lengths)}, + {"name": "min_output_length", "value": min(lengths)}, + {"name": "max_output_length", "value": max(lengths)} + ]) + + # Success rate + total_items = len(item_results) # Only successful items are included + results.append({ + "name": "processing_success_rate", + "value": 1.0, # All items in item_results succeeded + "comment": f"Successfully processed {total_items} items" + }) + 
+ return results + ``` + + Async run evaluator with external analysis: + ```python + async def llm_batch_analysis(*, item_results, **kwargs): + # Prepare batch analysis prompt + outputs = [result["output"] for result in item_results] + prompt = f"Analyze these {len(outputs)} outputs for common themes:\n" + prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs)) + + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": prompt}] + ) + + return { + "name": "thematic_analysis", + "value": response.choices[0].message.content, + "comment": f"LLM analysis of {len(outputs)} outputs" + } + ``` + + Performance distribution analysis: + ```python + def performance_distribution(*, item_results, **kwargs): + # Extract all evaluation scores + all_scores = [] + score_by_metric = {} + + for result in item_results: + for evaluation in result["evaluations"]: + metric_name = evaluation["name"] + value = evaluation["value"] + + if isinstance(value, (int, float)): + all_scores.append(value) + if metric_name not in score_by_metric: + score_by_metric[metric_name] = [] + score_by_metric[metric_name].append(value) + + results = [] + + # Overall score distribution + if all_scores: + import statistics + results.append({ + "name": "score_std_dev", + "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0, + "comment": f"Standard deviation across all numeric scores" + }) + + # Per-metric statistics + for metric, scores in score_by_metric.items(): + if len(scores) > 1: + results.append({ + "name": f"{metric}_variance", + "value": statistics.variance(scores), + "comment": f"Variance in {metric} across {len(scores)} items" + }) + + return results + ``` """ ... diff --git a/langfuse/types.py b/langfuse/types.py index b654fffed..8a186a345 100644 --- a/langfuse/types.py +++ b/langfuse/types.py @@ -1,4 +1,21 @@ -"""@private""" +"""Public API for all Langfuse types. + +This module provides a centralized location for importing commonly used types +from the Langfuse SDK, making them easily accessible without requiring nested imports. 
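Beyond the task and evaluation types, the dataset- and result-level aliases exported here let a whole experiment run be annotated end to end; a brief sketch against the `run_experiment` API added earlier in this series (assumes Langfuse credentials are configured):

```python
from langfuse import get_client
from langfuse.types import ExperimentData, ExperimentResult

data: ExperimentData = [
    {"input": "Germany", "expected_output": "Berlin"},
    {"input": "France", "expected_output": "Paris"},
]

result: ExperimentResult = get_client().run_experiment(
    name="Typed experiment",
    data=data,
    task=lambda *, item, **kwargs: f"Capital of {item['input']}",
)
```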
+ +Example: + ```python + from langfuse.types import Evaluation, LocalExperimentItem, TaskFunction + + # Define your task function + def my_task(*, item: LocalExperimentItem, **kwargs) -> str: + return f"Processed: {item['input']}" + + # Define your evaluator + def my_evaluator(*, output: str, **kwargs) -> Evaluation: + return {"name": "length", "value": len(output)} + ``` +""" from datetime import datetime from typing import ( @@ -22,6 +39,19 @@ from langfuse.api import MediaContentType, UsageDetails from langfuse.model import MapValue, ModelUsage, PromptClient +# Experiment types +from ._client.experiments import ( + LocalExperimentItem, + ExperimentItem, + ExperimentData, + Evaluation, + ExperimentItemResult, + ExperimentResult, + TaskFunction, + EvaluatorFunction, + RunEvaluatorFunction, +) + SpanLevel = Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"] ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"] @@ -84,3 +114,26 @@ class ParsedMediaReference(TypedDict): class TraceContext(TypedDict): trace_id: str parent_span_id: NotRequired[str] + + +# Export experiment types for easy access +__all__ = [ + # Experiment types + "LocalExperimentItem", + "ExperimentItem", + "ExperimentData", + "Evaluation", + "ExperimentItemResult", + "ExperimentResult", + "TaskFunction", + "EvaluatorFunction", + "RunEvaluatorFunction", + # Core types (keeping existing functionality) + "SpanLevel", + "ScoreDataType", + "TraceMetadata", + "ObservationParams", + "MaskFunction", + "ParsedMediaReference", + "TraceContext", +] From b8b2f8c304aafca556e89ff1da51135c795a2cb6 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:16:52 +0200 Subject: [PATCH 08/25] add run safe async --- langfuse/_client/client.py | 28 ++-- langfuse/_client/datasets.py | 2 + langfuse/_client/utils.py | 69 +++++++++- tests/test_utils.py | 254 +++++++++++++++++++++++++++++++++++ 4 files changed, 341 insertions(+), 12 deletions(-) create mode 100644 tests/test_utils.py diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 6d17ba0dc..b51402951 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -80,6 +80,7 @@ LangfuseSpan, LangfuseTool, ) +from langfuse._client.utils import run_async_safely from langfuse._utils import _get_timestamp from langfuse._utils.parse_error import handle_fern_exception from langfuse._utils.prompt_cache import PromptCache @@ -2617,18 +2618,23 @@ def average_accuracy(*, item_results, **kwargs): - Individual item failures are logged but don't stop the experiment - All executions are automatically traced and visible in Langfuse UI - When using Langfuse datasets, results are automatically linked for easy comparison + - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 
+ - Async execution is handled automatically with smart event loop detection """ - return asyncio.run( - self._run_experiment_async( - name=name, - description=description, - data=data, - task=task, - evaluators=evaluators or [], - run_evaluators=run_evaluators or [], - max_concurrency=max_concurrency, - metadata=metadata or {}, - ) + return cast( + ExperimentResult, + run_async_safely( + self._run_experiment_async( + name=name, + description=description, + data=data, + task=task, + evaluators=evaluators or [], + run_evaluators=run_evaluators or [], + max_concurrency=max_concurrency, + metadata=metadata or {}, + ), + ), ) async def _run_experiment_async( diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index cab0d98b6..023b7f947 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -382,6 +382,8 @@ def content_diversity(*, item_results, **kwargs): - Results can be easily compared across different experiment runs in the UI - The dataset_run_url provides direct access to detailed results and analysis - Failed items are handled gracefully and logged without stopping the experiment + - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) + - Async execution is handled automatically with smart event loop detection """ langfuse_client = self._get_langfuse_client() if not langfuse_client: diff --git a/langfuse/_client/utils.py b/langfuse/_client/utils.py index dac7a3f1b..d34857ebd 100644 --- a/langfuse/_client/utils.py +++ b/langfuse/_client/utils.py @@ -1,10 +1,13 @@ """Utility functions for Langfuse OpenTelemetry integration. This module provides utility functions for working with OpenTelemetry spans, -including formatting and serialization of span data. +including formatting and serialization of span data, and async execution helpers. """ +import asyncio import json +import threading +from typing import Any, Coroutine from opentelemetry import trace as otel_trace_api from opentelemetry.sdk import util @@ -58,3 +61,67 @@ def span_formatter(span: ReadableSpan) -> str: ) + "\n" ) + + +class _RunAsyncThread(threading.Thread): + """Helper thread class for running async coroutines in a separate thread.""" + + def __init__(self, coro: Coroutine[Any, Any, Any]) -> None: + self.coro = coro + self.result: Any = None + self.exception: Exception | None = None + super().__init__() + + def run(self) -> None: + try: + self.result = asyncio.run(self.coro) + except Exception as e: + self.exception = e + + +def run_async_safely(coro: Coroutine[Any, Any, Any]) -> Any: + """Safely run an async coroutine, handling existing event loops. + + This function detects if there's already a running event loop and uses + a separate thread if needed to avoid the "asyncio.run() cannot be called + from a running event loop" error. This is particularly useful in environments + like Jupyter notebooks, FastAPI applications, or other async frameworks. 
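In practice this helper is what allows the synchronous `run_experiment` wrapper to be called from code that is already running inside an event loop; a hedged sketch of that situation (the job function and data are placeholders):

```python
import asyncio

from langfuse import get_client

async def nightly_eval_job() -> None:
    # Already inside a running loop, yet the sync wrapper stays safe to call:
    # per the helper above, the coroutine runs on a separate thread instead of asyncio.run().
    result = get_client().run_experiment(
        name="Nightly eval",
        data=[{"input": "ping"}],
        task=lambda *, item, **kwargs: "pong",
    )
    print(len(result["item_results"]))

asyncio.run(nightly_eval_job())
```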
+ + Args: + coro: The coroutine to run + + Returns: + The result of the coroutine + + Raises: + Any exception raised by the coroutine + + Example: + ```python + # Works in both sync and async contexts + async def my_async_function(): + await asyncio.sleep(1) + return "done" + + result = run_async_safely(my_async_function()) + ``` + """ + try: + # Check if there's already a running event loop + loop = asyncio.get_running_loop() + except RuntimeError: + # No running loop, safe to use asyncio.run() + return asyncio.run(coro) + + if loop and loop.is_running(): + # There's a running loop, use a separate thread + thread = _RunAsyncThread(coro) + thread.start() + thread.join() + + if thread.exception: + raise thread.exception + return thread.result + else: + # Loop exists but not running, safe to use asyncio.run() + return asyncio.run(coro) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 000000000..ac3ee8473 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,254 @@ +"""Test suite for utility functions in langfuse._client.utils module.""" + +import asyncio +import threading +from unittest import mock + +import pytest + +from langfuse._client.utils import run_async_safely + + +class TestRunAsyncSafely: + """Test suite for the run_async_safely function.""" + + def test_run_sync_context_simple(self): + """Test run_async_safely in sync context with simple coroutine.""" + + async def simple_coro(): + await asyncio.sleep(0.01) + return "hello" + + result = run_async_safely(simple_coro()) + assert result == "hello" + + def test_run_sync_context_with_value(self): + """Test run_async_safely in sync context with parameter passing.""" + + async def coro_with_params(value, multiplier=2): + await asyncio.sleep(0.01) + return value * multiplier + + result = run_async_safely(coro_with_params(5, multiplier=3)) + assert result == 15 + + def test_run_sync_context_with_exception(self): + """Test run_async_safely properly propagates exceptions in sync context.""" + + async def failing_coro(): + await asyncio.sleep(0.01) + raise ValueError("Test error") + + with pytest.raises(ValueError, match="Test error"): + run_async_safely(failing_coro()) + + @pytest.mark.asyncio + async def test_run_async_context_simple(self): + """Test run_async_safely from within async context (uses threading).""" + + async def simple_coro(): + await asyncio.sleep(0.01) + return "from_thread" + + # This should use threading since we're already in an async context + result = run_async_safely(simple_coro()) + assert result == "from_thread" + + @pytest.mark.asyncio + async def test_run_async_context_with_exception(self): + """Test run_async_safely properly propagates exceptions from thread.""" + + async def failing_coro(): + await asyncio.sleep(0.01) + raise RuntimeError("Thread error") + + with pytest.raises(RuntimeError, match="Thread error"): + run_async_safely(failing_coro()) + + @pytest.mark.asyncio + async def test_run_async_context_thread_isolation(self): + """Test that threaded execution is properly isolated.""" + # Set a thread-local value in the main async context + threading.current_thread().test_value = "main_thread" + + async def check_thread_isolation(): + # This should run in a different thread + current_thread = threading.current_thread() + # Should not have the test_value from main thread + assert not hasattr(current_thread, "test_value") + return "isolated" + + result = run_async_safely(check_thread_isolation()) + assert result == "isolated" + + def test_multiple_calls_sync_context(self): + 
"""Test multiple sequential calls in sync context.""" + + async def counter_coro(count): + await asyncio.sleep(0.001) + return count * 2 + + results = [] + for i in range(5): + result = run_async_safely(counter_coro(i)) + results.append(result) + + assert results == [0, 2, 4, 6, 8] + + @pytest.mark.asyncio + async def test_multiple_calls_async_context(self): + """Test multiple sequential calls in async context (each uses threading).""" + + async def counter_coro(count): + await asyncio.sleep(0.001) + return count * 3 + + results = [] + for i in range(3): + result = run_async_safely(counter_coro(i)) + results.append(result) + + assert results == [0, 3, 6] + + def test_concurrent_calls_sync_context(self): + """Test concurrent calls in sync context using threading.""" + + async def slow_coro(value): + await asyncio.sleep(0.02) + return value**2 + + import concurrent.futures + + with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: + futures = [] + for i in range(3): + future = executor.submit(run_async_safely, slow_coro(i + 1)) + futures.append(future) + + results = [future.result() for future in futures] + + # Results should be squares: 1^2, 2^2, 3^2 + assert sorted(results) == [1, 4, 9] + + def test_event_loop_detection_mock(self): + """Test event loop detection logic with mocking.""" + + async def simple_coro(): + return "mocked" + + # Mock no running loop - should use asyncio.run + with mock.patch( + "asyncio.get_running_loop", side_effect=RuntimeError("No loop") + ): + with mock.patch( + "asyncio.run", return_value="asyncio_run_called" + ) as mock_run: + result = run_async_safely(simple_coro()) + assert result == "asyncio_run_called" + mock_run.assert_called_once() + + def test_complex_coroutine(self): + """Test with a more complex coroutine that does actual async work.""" + + async def complex_coro(): + # Simulate some async operations + results = [] + for i in range(3): + await asyncio.sleep(0.001) + results.append(i**2) + + # Simulate concurrent operations + async def sub_task(x): + await asyncio.sleep(0.001) + return x * 10 + + tasks = [sub_task(x) for x in range(2)] + concurrent_results = await asyncio.gather(*tasks) + results.extend(concurrent_results) + + return results + + result = run_async_safely(complex_coro()) + assert result == [0, 1, 4, 0, 10] # [0^2, 1^2, 2^2, 0*10, 1*10] + + @pytest.mark.asyncio + async def test_nested_async_calls(self): + """Test that nested calls to run_async_safely work correctly.""" + + async def inner_coro(value): + await asyncio.sleep(0.001) + return value * 2 + + async def outer_coro(value): + # This is already in an async context, so the inner call + # will also use threading + inner_result = run_async_safely(inner_coro(value)) + await asyncio.sleep(0.001) + return inner_result + 1 + + result = run_async_safely(outer_coro(5)) + assert result == 11 # (5 * 2) + 1 + + def test_exception_types_preserved(self): + """Test that different exception types are properly preserved.""" + + async def custom_exception_coro(): + await asyncio.sleep(0.001) + + class CustomError(Exception): + pass + + raise CustomError("Custom error message") + + with pytest.raises(Exception) as exc_info: + run_async_safely(custom_exception_coro()) + + # The exception type should be preserved + assert "Custom error message" in str(exc_info.value) + + def test_return_types_preserved(self): + """Test that various return types are properly preserved.""" + + async def dict_coro(): + await asyncio.sleep(0.001) + return {"key": "value", "number": 42} + + async def 
list_coro(): + await asyncio.sleep(0.001) + return [1, 2, 3, "string"] + + async def none_coro(): + await asyncio.sleep(0.001) + return None + + dict_result = run_async_safely(dict_coro()) + assert dict_result == {"key": "value", "number": 42} + assert isinstance(dict_result, dict) + + list_result = run_async_safely(list_coro()) + assert list_result == [1, 2, 3, "string"] + assert isinstance(list_result, list) + + none_result = run_async_safely(none_coro()) + assert none_result is None + + @pytest.mark.asyncio + async def test_real_world_scenario_jupyter_simulation(self): + """Test scenario simulating Jupyter notebook environment.""" + # This simulates being called from a Jupyter cell where there's + # already an event loop running + + async def simulate_llm_call(prompt): + """Simulate an LLM API call.""" + await asyncio.sleep(0.01) # Simulate network delay + return f"Response to: {prompt}" + + async def simulate_experiment_task(item): + """Simulate an experiment task function.""" + response = await simulate_llm_call(item["input"]) + await asyncio.sleep(0.001) # Additional processing + return response + + # This should work even though we're in an async context + result = run_async_safely(simulate_experiment_task({"input": "test prompt"})) + assert result == "Response to: test prompt" From db09d7fa7ecda28c75ee213d30137d94efe57f16 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Fri, 12 Sep 2025 10:29:15 +0200 Subject: [PATCH 09/25] push --- langfuse/_client/client.py | 3 +++ langfuse/_client/experiments.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index b51402951..27958b967 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2722,6 +2722,9 @@ async def process_item(item: ExperimentItem) -> dict: except Exception as e: langfuse_logger.error(f"Failed to store run evaluation: {e}") + # Flush scores and traces + self.flush() + return { "item_results": valid_results, "run_evaluations": run_evaluations, diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index 8628da489..5833af70a 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -289,8 +289,8 @@ def __call__( *, input: Any, output: Any, - expected_output: Any = None, - metadata: Optional[Dict[str, Any]] = None, + expected_output: Any, + metadata: Optional[Dict[str, Any]], **kwargs: Dict[str, Any], ) -> Union[ Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] From 285cc99d5f9eadc4b576cf395371c855a6215fb2 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Fri, 12 Sep 2025 10:31:45 +0200 Subject: [PATCH 10/25] push --- langfuse/_task_manager/media_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/langfuse/_task_manager/media_manager.py b/langfuse/_task_manager/media_manager.py index a36e3b8af..1a32e3d60 100644 --- a/langfuse/_task_manager/media_manager.py +++ b/langfuse/_task_manager/media_manager.py @@ -49,7 +49,6 @@ def process_next_media_upload(self) -> None: self._queue.task_done() except Empty: - self._log.debug("Queue: Media upload queue is empty, waiting for new jobs") pass except Exception as e: self._log.error( @@ -248,7 +247,7 @@ def _process_upload_media_job( headers = {"Content-Type": data["content_type"]} - # In self-hosted setups with GCP, do not add unsupported headers that fail the upload + # In self-hosted setups 
with GCP, do not add unsupported headers that fail the upload is_self_hosted_gcs_bucket = "storage.googleapis.com" in upload_url if not is_self_hosted_gcs_bucket: From f94dab3b9494eb78c42b1b5d53d7e92f2b04b1de Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Fri, 12 Sep 2025 14:08:32 +0200 Subject: [PATCH 11/25] add autoevals adapter --- langfuse/_client/experiments.py | 32 +++ langfuse/experiment.py | 25 ++ langfuse/types.py | 25 -- poetry.lock | 460 ++++++++++++++++++++++++-------- pyproject.toml | 1 + 5 files changed, 401 insertions(+), 142 deletions(-) create mode 100644 langfuse/experiment.py diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index 5833af70a..2e004d686 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -747,3 +747,35 @@ async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any: result = await result return result + + +def create_evaluator_from_autoevals( + autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]] +) -> EvaluatorFunction: + """Create a Langfuse evaluator from an autoevals evaluator. + + Args: + autoevals_evaluator: An autoevals evaluator instance + **kwargs: Additional arguments passed to the evaluator + + Returns: + A Langfuse-compatible evaluator function + """ + + def langfuse_evaluator( + *, + input: Any, + output: Any, + expected_output: Any, + metadata: Optional[Dict[str, Any]], + **kwargs: Dict[str, Any], + ) -> Evaluation: + evaluation = autoevals_evaluator( + input=input, output=output, expected=expected_output, **kwargs + ) + + return Evaluation( + name=evaluation.name, value=evaluation.score, metadata=evaluation.metadata + ) + + return langfuse_evaluator diff --git a/langfuse/experiment.py b/langfuse/experiment.py new file mode 100644 index 000000000..2d54255e2 --- /dev/null +++ b/langfuse/experiment.py @@ -0,0 +1,25 @@ +from ._client.experiments import ( + Evaluation, + EvaluatorFunction, + ExperimentData, + ExperimentItem, + ExperimentItemResult, + ExperimentResult, + LocalExperimentItem, + RunEvaluatorFunction, + TaskFunction, + create_evaluator_from_autoevals, +) + +__all__ = [ + "LocalExperimentItem", + "ExperimentItem", + "ExperimentData", + "Evaluation", + "ExperimentItemResult", + "ExperimentResult", + "TaskFunction", + "EvaluatorFunction", + "RunEvaluatorFunction", + "create_evaluator_from_autoevals", +] diff --git a/langfuse/types.py b/langfuse/types.py index 8a186a345..32ebb32d4 100644 --- a/langfuse/types.py +++ b/langfuse/types.py @@ -39,19 +39,6 @@ def my_evaluator(*, output: str, **kwargs) -> Evaluation: from langfuse.api import MediaContentType, UsageDetails from langfuse.model import MapValue, ModelUsage, PromptClient -# Experiment types -from ._client.experiments import ( - LocalExperimentItem, - ExperimentItem, - ExperimentData, - Evaluation, - ExperimentItemResult, - ExperimentResult, - TaskFunction, - EvaluatorFunction, - RunEvaluatorFunction, -) - SpanLevel = Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"] ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"] @@ -116,19 +103,7 @@ class TraceContext(TypedDict): parent_span_id: NotRequired[str] -# Export experiment types for easy access __all__ = [ - # Experiment types - "LocalExperimentItem", - "ExperimentItem", - "ExperimentData", - "Evaluation", - "ExperimentItemResult", - "ExperimentResult", - "TaskFunction", - "EvaluatorFunction", - "RunEvaluatorFunction", - # Core types (keeping existing functionality) "SpanLevel", "ScoreDataType", 
"TraceMetadata", diff --git a/poetry.lock b/poetry.lock index 3380643bd..2cdb8e476 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "annotated-types" @@ -6,7 +6,6 @@ version = "0.7.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, @@ -18,7 +17,6 @@ version = "4.10.0" description = "High-level concurrency and networking framework on top of asyncio or Trio" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "anyio-4.10.0-py3-none-any.whl", hash = "sha256:60e474ac86736bbfd6f210f7a61218939c318f43f9972497381f1c5e930ed3d1"}, {file = "anyio-4.10.0.tar.gz", hash = "sha256:3f3fae35c96039744587aa5b8371e7e8e603c0702999535961dd336026973ba6"}, @@ -39,20 +37,59 @@ version = "4.0.3" description = "Timeout context manager for asyncio programs" optional = true python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"langchain\" and python_version < \"3.11\"" files = [ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, ] +[[package]] +name = "attrs" +version = "25.3.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.8" +files = [ + {file = "attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3"}, + {file = "attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b"}, +] + +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] + +[[package]] +name = "autoevals" +version = "0.0.130" +description = "Universal library for evaluating AI models" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "autoevals-0.0.130-py3-none-any.whl", hash = "sha256:ffb7b3a21070d2a4e593bb118180c04e43531e608bffd854624377bd857ceec0"}, + {file = "autoevals-0.0.130.tar.gz", hash = "sha256:92f87ab95a575b56d9d7377e6f1399932d09180d2f3a8266b4f693f46f49b86d"}, +] + +[package.dependencies] +chevron = "*" +jsonschema = "*" +polyleven = "*" +pyyaml = "*" + +[package.extras] +all = ["IPython", "black (==22.6.0)", "braintrust", "build", "flake8", 
"flake8-isort", "isort (==5.12.0)", "numpy", "openai", "pre-commit", "pydoc-markdown", "pytest", "respx", "scipy", "twine"] +dev = ["IPython", "black (==22.6.0)", "braintrust", "build", "flake8", "flake8-isort", "isort (==5.12.0)", "openai", "pre-commit", "pytest", "respx", "twine"] +doc = ["pydoc-markdown"] +scipy = ["numpy", "scipy"] + [[package]] name = "backoff" version = "2.2.1" description = "Function decoration for backoff and retry" optional = false python-versions = ">=3.7,<4.0" -groups = ["main"] files = [ {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, @@ -64,8 +101,6 @@ version = "1.2.0" description = "Backport of asyncio.Runner, a context manager that controls event loop life cycle." optional = false python-versions = "<3.11,>=3.8" -groups = ["dev"] -markers = "python_version < \"3.11\"" files = [ {file = "backports_asyncio_runner-1.2.0-py3-none-any.whl", hash = "sha256:0da0a936a8aeb554eccb426dc55af3ba63bcdc69fa1a600b5bb305413a4477b5"}, {file = "backports_asyncio_runner-1.2.0.tar.gz", hash = "sha256:a5aa7b2b7d8f8bfcaa2b57313f70792df84e32a2a746f585213373f900b42162"}, @@ -77,7 +112,6 @@ version = "2025.8.3" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.7" -groups = ["main", "dev"] files = [ {file = "certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5"}, {file = "certifi-2025.8.3.tar.gz", hash = "sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407"}, @@ -89,7 +123,6 @@ version = "3.4.0" description = "Validate configuration and produce human readable error messages." optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, @@ -101,7 +134,6 @@ version = "3.4.3" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.7" -groups = ["main", "dev"] files = [ {file = "charset_normalizer-3.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:fb7f67a1bfa6e40b438170ebdc8158b78dc465a5a67b6dde178a46987b244a72"}, {file = "charset_normalizer-3.4.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cc9370a2da1ac13f0153780040f465839e6cccb4a1e44810124b4e22483c93fe"}, @@ -184,18 +216,27 @@ files = [ {file = "charset_normalizer-3.4.3.tar.gz", hash = "sha256:6fce4b8500244f6fcb71465d4a4930d132ba9ab8e71a7859e6a5d59851068d14"}, ] +[[package]] +name = "chevron" +version = "0.14.0" +description = "Mustache templating language renderer" +optional = false +python-versions = "*" +files = [ + {file = "chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443"}, + {file = "chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf"}, +] + [[package]] name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["main", "dev"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] -markers = {main = "extra == \"openai\" and platform_system == \"Windows\"", dev = "platform_system == \"Windows\" or sys_platform == \"win32\""} [[package]] name = "distlib" @@ -203,7 +244,6 @@ version = "0.4.0" description = "Distribution utilities" optional = false python-versions = "*" -groups = ["dev"] files = [ {file = "distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16"}, {file = "distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d"}, @@ -215,12 +255,10 @@ version = "1.9.0" description = "Distro - an OS platform information API" optional = false python-versions = ">=3.6" -groups = ["main", "dev"] files = [ {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, ] -markers = {main = "extra == \"openai\""} [[package]] name = "exceptiongroup" @@ -228,8 +266,6 @@ version = "1.3.0" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] -markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10"}, {file = "exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88"}, @@ -247,7 +283,6 @@ version = "2.1.1" description = "execnet: rapid multi-Python deployment" optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "execnet-2.1.1-py3-none-any.whl", hash = "sha256:26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc"}, {file = "execnet-2.1.1.tar.gz", hash = "sha256:5189b52c6121c24feae288166ab41b32549c7e2348652736540b9e6e7d4e72e3"}, @@ -262,7 +297,6 @@ version = "3.19.1" description = "A platform independent file lock." 
optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d"}, {file = "filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58"}, @@ -274,7 +308,6 @@ version = "1.70.0" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" -groups = ["main"] files = [ {file = "googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8"}, {file = "googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257"}, @@ -292,8 +325,6 @@ version = "3.2.4" description = "Lightweight in-process concurrent programming" optional = true python-versions = ">=3.9" -groups = ["main"] -markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") and extra == \"langchain\"" files = [ {file = "greenlet-3.2.4-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8c68325b0d0acf8d91dde4e6f930967dd52a5302cd4062932a6b2e7c2969f47c"}, {file = "greenlet-3.2.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:94385f101946790ae13da500603491f04a76b6e4c059dab271b3ce2e283b2590"}, @@ -361,7 +392,6 @@ version = "0.16.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86"}, {file = "h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1"}, @@ -373,7 +403,6 @@ version = "1.0.9" description = "A minimal low-level HTTP client." optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55"}, {file = "httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8"}, @@ -395,7 +424,6 @@ version = "0.28.1" description = "The next generation HTTP client." 
optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad"}, {file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"}, @@ -408,7 +436,7 @@ httpcore = "==1.*" idna = "*" [package.extras] -brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] +brotli = ["brotli", "brotlicffi"] cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] @@ -420,7 +448,6 @@ version = "2.6.13" description = "File identification library for Python" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "identify-2.6.13-py2.py3-none-any.whl", hash = "sha256:60381139b3ae39447482ecc406944190f690d4a2997f2584062089848361b33b"}, {file = "identify-2.6.13.tar.gz", hash = "sha256:da8d6c828e773620e13bfa86ea601c5a5310ba4bcd65edf378198b56a1f9fb32"}, @@ -435,7 +462,6 @@ version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" -groups = ["main", "dev"] files = [ {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, @@ -450,7 +476,6 @@ version = "8.7.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd"}, {file = "importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000"}, @@ -460,12 +485,12 @@ files = [ zipp = ">=3.20" [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] perf = ["ipython"] -test = ["flufl.flake8", "importlib_resources (>=1.3) ; python_version < \"3.9\"", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +test = ["flufl.flake8", "importlib_resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] type = ["pytest-mypy"] [[package]] @@ -474,7 +499,6 @@ version = "2.1.0" description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, @@ -486,7 +510,6 @@ version = "3.1.6" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" -groups = ["docs"] files = [ {file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"}, {file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"}, @@ -504,7 +527,6 @@ version = "0.10.0" description = "Fast iterable JSON parser." optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "jiter-0.10.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:cd2fb72b02478f06a900a5782de2ef47e0396b3e1f7d5aba30daeb1fce66f303"}, {file = "jiter-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:32bb468e3af278f095d3fa5b90314728a6916d89ba3d0ffb726dd9bf7367285e"}, @@ -584,7 +606,6 @@ files = [ {file = "jiter-0.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:1b28302349dc65703a9e4ead16f163b1c339efffbe1049c30a44b001a2a4fff9"}, {file = "jiter-0.10.0.tar.gz", hash = "sha256:07a7142c38aacc85194391108dc91b5b57093c978a9932bd86a36862759d9500"}, ] -markers = {main = "extra == \"openai\""} [[package]] name = "jsonpatch" @@ -592,12 +613,10 @@ version = "1.33" description = "Apply JSON-Patches (RFC 6902)" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" -groups = ["main", "dev"] files = [ {file = "jsonpatch-1.33-py2.py3-none-any.whl", hash = "sha256:0ae28c0cd062bbd8b8ecc26d7d164fbbea9652a1a3693f3b956c1eae5145dade"}, {file = "jsonpatch-1.33.tar.gz", hash = "sha256:9fcd4009c41e6d12348b4a0ff2563ba56a2923a7dfee731d004e212e1ee5030c"}, ] -markers = {main = "extra == \"langchain\""} [package.dependencies] jsonpointer = ">=1.9" @@ -608,12 +627,45 @@ version = "3.0.0" description = "Identify specific nodes in a JSON document (RFC 6901)" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] files = [ {file = "jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942"}, {file = "jsonpointer-3.0.0.tar.gz", hash = "sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef"}, ] -markers = {main = "extra == \"langchain\""} + +[[package]] +name = "jsonschema" +version = "4.25.1" +description = "An implementation of JSON Schema validation for Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "jsonschema-4.25.1-py3-none-any.whl", hash = "sha256:3fba0169e345c7175110351d456342c364814cfcf3b964ba4587f22915230a63"}, + {file = "jsonschema-4.25.1.tar.gz", hash = "sha256:e4a9655ce0da0c0b67a085847e00a3a51449e1157f4f75e9fb5aa545e122eb85"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +jsonschema-specifications = ">=2023.03.6" +referencing = ">=0.28.4" +rpds-py = ">=0.7.1" + +[package.extras] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "rfc3987-syntax (>=1.1.0)", "uri-template", "webcolors (>=24.6.0)"] + +[[package]] +name = "jsonschema-specifications" +version = "2025.9.1" +description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" +optional = false +python-versions = ">=3.9" +files = [ + {file = "jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe"}, + {file = "jsonschema_specifications-2025.9.1.tar.gz", hash = 
"sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d"}, +] + +[package.dependencies] +referencing = ">=0.31.0" [[package]] name = "langchain" @@ -621,8 +673,6 @@ version = "0.3.27" description = "Building applications with LLMs through composability" optional = true python-versions = "<4.0,>=3.9" -groups = ["main"] -markers = "extra == \"langchain\"" files = [ {file = "langchain-0.3.27-py3-none-any.whl", hash = "sha256:7b20c4f338826acb148d885b20a73a16e410ede9ee4f19bb02011852d5f98798"}, {file = "langchain-0.3.27.tar.gz", hash = "sha256:aa6f1e6274ff055d0fd36254176770f356ed0a8994297d1df47df341953cec62"}, @@ -663,12 +713,10 @@ version = "0.3.75" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "langchain_core-0.3.75-py3-none-any.whl", hash = "sha256:03ca1fadf955ee3c7d5806a841f4b3a37b816acea5e61a7e6ba1298c05eea7f5"}, {file = "langchain_core-0.3.75.tar.gz", hash = "sha256:ab0eb95a06ed6043f76162e6086b45037690cb70b7f090bd83b5ebb8a05b70ed"}, ] -markers = {main = "extra == \"langchain\""} [package.dependencies] jsonpatch = ">=1.33,<2.0" @@ -685,7 +733,6 @@ version = "0.3.32" description = "An integration package connecting OpenAI and LangChain" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "langchain_openai-0.3.32-py3-none-any.whl", hash = "sha256:3354f76822f7cc76d8069831fe2a77f9bc7ff3b4f13af788bd94e4c6e853b400"}, {file = "langchain_openai-0.3.32.tar.gz", hash = "sha256:782ad669bd1bdb964456d8882c5178717adcfceecb482cc20005f770e43d346d"}, @@ -702,8 +749,6 @@ version = "0.3.9" description = "LangChain text splitting utilities" optional = true python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"langchain\"" files = [ {file = "langchain_text_splitters-0.3.9-py3-none-any.whl", hash = "sha256:cee0bb816211584ea79cc79927317c358543f40404bcfdd69e69ba3ccde54401"}, {file = "langchain_text_splitters-0.3.9.tar.gz", hash = "sha256:7cd1e5a3aaf609979583eeca2eb34177622570b8fa8f586a605c6b1c34e7ebdb"}, @@ -718,7 +763,6 @@ version = "0.6.6" description = "Building stateful, multi-actor applications with LLMs" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "langgraph-0.6.6-py3-none-any.whl", hash = "sha256:a2283a5236abba6c8307c1a485c04e8a0f0ffa2be770878782a7bf2deb8d7954"}, {file = "langgraph-0.6.6.tar.gz", hash = "sha256:e7d3cefacf356f8c01721b166b67b3bf581659d5361a3530f59ecd9b8448eca7"}, @@ -738,7 +782,6 @@ version = "2.1.1" description = "Library with base interfaces for LangGraph checkpoint savers." optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "langgraph_checkpoint-2.1.1-py3-none-any.whl", hash = "sha256:5a779134fd28134a9a83d078be4450bbf0e0c79fdf5e992549658899e6fc5ea7"}, {file = "langgraph_checkpoint-2.1.1.tar.gz", hash = "sha256:72038c0f9e22260cb9bff1f3ebe5eb06d940b7ee5c1e4765019269d4f21cf92d"}, @@ -754,7 +797,6 @@ version = "0.6.4" description = "Library with high-level APIs for creating and executing LangGraph agents and tools." 
optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "langgraph_prebuilt-0.6.4-py3-none-any.whl", hash = "sha256:819f31d88b84cb2729ff1b79db2d51e9506b8fb7aaacfc0d359d4fe16e717344"}, {file = "langgraph_prebuilt-0.6.4.tar.gz", hash = "sha256:e9e53b906ee5df46541d1dc5303239e815d3ec551e52bb03dd6463acc79ec28f"}, @@ -770,7 +812,6 @@ version = "0.2.3" description = "SDK for interacting with LangGraph API" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "langgraph_sdk-0.2.3-py3-none-any.whl", hash = "sha256:059edfe2f62708c2e54239e170f5a33f796d456dbdbde64276c16cac8b97ba99"}, {file = "langgraph_sdk-0.2.3.tar.gz", hash = "sha256:17398aeae0f937cae1c8eb9027ada2969abdb50fe8ed3246c78f543b679cf959"}, @@ -786,12 +827,10 @@ version = "0.4.19" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "langsmith-0.4.19-py3-none-any.whl", hash = "sha256:4c50ae47e9f8430a06adb54bceaf32808f5e54fcb8186731bf7b2dab3fc30621"}, {file = "langsmith-0.4.19.tar.gz", hash = "sha256:71916bef574f72c40887ce371a4502d80c80efc2a053df123f1347e79ea83dca"}, ] -markers = {main = "extra == \"langchain\""} [package.dependencies] httpx = ">=0.23.0,<1" @@ -815,7 +854,6 @@ version = "3.0.2" description = "Safely add untrusted strings to HTML/XML markup." optional = false python-versions = ">=3.9" -groups = ["dev", "docs"] files = [ {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, @@ -886,7 +924,6 @@ version = "1.17.1" description = "Optional static typing for Python" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "mypy-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3fbe6d5555bf608c47203baa3e72dbc6ec9965b3d7c318aa9a4ca76f465bd972"}, {file = "mypy-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80ef5c058b7bce08c83cac668158cb7edea692e458d21098c7d3bce35a5d43e7"}, @@ -947,7 +984,6 @@ version = "1.1.0" description = "Type system extensions for programs checked with the mypy type checker." 
optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505"}, {file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"}, @@ -959,7 +995,6 @@ version = "1.9.1" description = "Node.js virtual environment builder" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["dev"] files = [ {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, @@ -971,12 +1006,10 @@ version = "1.102.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "openai-1.102.0-py3-none-any.whl", hash = "sha256:d751a7e95e222b5325306362ad02a7aa96e1fab3ed05b5888ce1c7ca63451345"}, {file = "openai-1.102.0.tar.gz", hash = "sha256:2e0153bcd64a6523071e90211cbfca1f2bbc5ceedd0993ba932a5869f93b7fc9"}, ] -markers = {main = "extra == \"openai\""} [package.dependencies] anyio = ">=3.5.0,<5" @@ -1000,7 +1033,6 @@ version = "1.36.0" description = "OpenTelemetry Python API" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_api-1.36.0-py3-none-any.whl", hash = "sha256:02f20bcacf666e1333b6b1f04e647dc1d5111f86b8e510238fcc56d7762cda8c"}, {file = "opentelemetry_api-1.36.0.tar.gz", hash = "sha256:9a72572b9c416d004d492cbc6e61962c0501eaf945ece9b5a0f56597d8348aa0"}, @@ -1016,7 +1048,6 @@ version = "1.36.0" description = "OpenTelemetry Protobuf encoding" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_exporter_otlp_proto_common-1.36.0-py3-none-any.whl", hash = "sha256:0fc002a6ed63eac235ada9aa7056e5492e9a71728214a61745f6ad04b923f840"}, {file = "opentelemetry_exporter_otlp_proto_common-1.36.0.tar.gz", hash = "sha256:6c496ccbcbe26b04653cecadd92f73659b814c6e3579af157d8716e5f9f25cbf"}, @@ -1031,7 +1062,6 @@ version = "1.36.0" description = "OpenTelemetry Collector Protobuf over HTTP Exporter" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_exporter_otlp_proto_http-1.36.0-py3-none-any.whl", hash = "sha256:3d769f68e2267e7abe4527f70deb6f598f40be3ea34c6adc35789bea94a32902"}, {file = "opentelemetry_exporter_otlp_proto_http-1.36.0.tar.gz", hash = "sha256:dd3637f72f774b9fc9608ab1ac479f8b44d09b6fb5b2f3df68a24ad1da7d356e"}, @@ -1052,7 +1082,6 @@ version = "1.36.0" description = "OpenTelemetry Python Proto" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_proto-1.36.0-py3-none-any.whl", hash = "sha256:151b3bf73a09f94afc658497cf77d45a565606f62ce0c17acb08cd9937ca206e"}, {file = "opentelemetry_proto-1.36.0.tar.gz", hash = "sha256:0f10b3c72f74c91e0764a5ec88fd8f1c368ea5d9c64639fb455e2854ef87dd2f"}, @@ -1067,7 +1096,6 @@ version = "1.36.0" description = "OpenTelemetry Python SDK" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_sdk-1.36.0-py3-none-any.whl", hash = "sha256:19fe048b42e98c5c1ffe85b569b7073576ad4ce0bcb6e9b4c6a39e890a6c45fb"}, {file = "opentelemetry_sdk-1.36.0.tar.gz", hash = "sha256:19c8c81599f51b71670661ff7495c905d8fdf6976e41622d5245b791b06fa581"}, @@ -1084,7 +1112,6 @@ version = "0.57b0" 
description = "OpenTelemetry Semantic Conventions" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl", hash = "sha256:757f7e76293294f124c827e514c2a3144f191ef175b069ce8d1211e1e38e9e78"}, {file = "opentelemetry_semantic_conventions-0.57b0.tar.gz", hash = "sha256:609a4a79c7891b4620d64c7aac6898f872d790d75f22019913a660756f27ff32"}, @@ -1100,7 +1127,6 @@ version = "3.11.3" description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "orjson-3.11.3-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:29cb1f1b008d936803e2da3d7cba726fc47232c45df531b29edf0b232dd737e7"}, {file = "orjson-3.11.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97dceed87ed9139884a55db8722428e27bd8452817fbf1869c58b49fecab1120"}, @@ -1186,7 +1212,6 @@ files = [ {file = "orjson-3.11.3-cp39-cp39-win_amd64.whl", hash = "sha256:215c595c792a87d4407cb72dd5e0f6ee8e694ceeb7f9102b533c5a9bf2a916bb"}, {file = "orjson-3.11.3.tar.gz", hash = "sha256:1c0603b1d2ffcd43a411d64797a19556ef76958aef1c182f22dc30860152a98a"}, ] -markers = {main = "extra == \"langchain\" and platform_python_implementation != \"PyPy\""} [[package]] name = "ormsgpack" @@ -1194,7 +1219,6 @@ version = "1.10.0" description = "Fast, correct Python msgpack library supporting dataclasses, datetimes, and numpy" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "ormsgpack-1.10.0-cp310-cp310-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:8a52c7ce7659459f3dc8dec9fd6a6c76f855a0a7e2b61f26090982ac10b95216"}, {file = "ormsgpack-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:060f67fe927582f4f63a1260726d019204b72f460cf20930e6c925a1d129f373"}, @@ -1245,7 +1269,6 @@ version = "25.0" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"}, {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"}, @@ -1257,7 +1280,6 @@ version = "0.12.1" description = "Utility library for gitignore style pattern matching of file paths." optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, @@ -1269,7 +1291,6 @@ version = "15.0.4" description = "API Documentation for Python Projects" optional = false python-versions = ">=3.9" -groups = ["docs"] files = [ {file = "pdoc-15.0.4-py3-none-any.whl", hash = "sha256:f9028e85e7bb8475b054e69bde1f6d26fc4693d25d9fa1b1ce9009bec7f7a5c4"}, {file = "pdoc-15.0.4.tar.gz", hash = "sha256:cf9680f10f5b4863381f44ef084b1903f8f356acb0d4cc6b64576ba9fb712c82"}, @@ -1286,7 +1307,6 @@ version = "4.4.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "platformdirs-4.4.0-py3-none-any.whl", hash = "sha256:abd01743f24e5287cd7a5db3752faf1a2d65353f38ec26d98e25a6db65958c85"}, {file = "platformdirs-4.4.0.tar.gz", hash = "sha256:ca753cf4d81dc309bc67b0ea38fd15dc97bc30ce419a7f58d13eb3bf14c4febf"}, @@ -1303,7 +1323,6 @@ version = "1.6.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, @@ -1313,13 +1332,76 @@ files = [ dev = ["pre-commit", "tox"] testing = ["coverage", "pytest", "pytest-benchmark"] +[[package]] +name = "polyleven" +version = "0.9.0" +description = "A fast C-implemented library for Levenshtein distance" +optional = false +python-versions = ">=3.8" +files = [ + {file = "polyleven-0.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6e00207fbe0fcdde206b9b277cf14bb9db8801f8d303204b1572870797399974"}, + {file = "polyleven-0.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d400f255af038f77b37d5010532e0e82d07160457c8282e5b40632987ab815be"}, + {file = "polyleven-0.9.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a1d3f1b385e9f51090beca54925a0fd0ab2d744fcea91dd9353c7b13bbb274f"}, + {file = "polyleven-0.9.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:2be92bb7743e3b3e14a2b894902f4ceeea5700849dd9e9ab59c68bd7943b3d85"}, + {file = "polyleven-0.9.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7bd784bad5164d0d4e823d98aa8ffdc118c14d211dfd7271ede7f1baa7efc691"}, + {file = "polyleven-0.9.0-cp310-cp310-win32.whl", hash = "sha256:bac610f5a30b56ab2fbb1a3de071ef9ed3aa6a572a80a4cfbf0665929e0f6451"}, + {file = "polyleven-0.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:4e4ab3cfc196907751adb3b65959ad8be08fc06679d071fdf01e5225f394812e"}, + {file = "polyleven-0.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e58bbcd3f062043fa67e76e89f803eb308ea06fbb4dc6f32d7063c37f1c16dfd"}, + {file = "polyleven-0.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3fd803de02e99f51ade3fcae4e5be50c89c1ff360213bcdbcf98820e2633c71a"}, + {file = "polyleven-0.9.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff60e2da0864b3d4bec2826eadbbb0a8967384d53bec9e693aad7b0089e1258c"}, + {file = "polyleven-0.9.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:259856641423ca82230237d637869301ba02971c24283101b67c8117e7116b7a"}, + {file = "polyleven-0.9.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a46e7b364b3936f025022d1182e10cba9ac45974dc2cafa17b7f9f515784adb5"}, + {file = "polyleven-0.9.0-cp311-cp311-win32.whl", hash = "sha256:6f0fd999efaa0d5409603ae7e44b60152b8d12a190b54115bcf0ba93e41e09f1"}, + {file = "polyleven-0.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:65a6e899db184bce6384526e46f446c6c159a2b0bb3b463dcc78a2bc8ddf85f5"}, + {file = "polyleven-0.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1b9c905fa0862c1f3e27e948a713fb86a26ce1659f1d90b1b4aff04a8890213b"}, + {file = "polyleven-0.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:7058bea0da4893ebb8bedd9f638ec4e026c150e29b7b7385db5c157742d0ff11"}, + {file = "polyleven-0.9.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b99fcfc48c1eaacc4a46dd9d22dc98de111120c66b56df14257f276b762bd591"}, + {file = "polyleven-0.9.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:29ef7db85a7bb01be9372461bc8d8993d4817dfcea702e4d2b8f0d9c43415ebe"}, + {file = "polyleven-0.9.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:288bfe0a0040421c52a5dc312b55c47812a72fb9cd7e6d19859ac2f9f11f350f"}, + {file = "polyleven-0.9.0-cp312-cp312-win32.whl", hash = "sha256:7260fa32fff7194e06b4221e0a6d2ba2decd4e4dc51f7f8cddbf365649326ee4"}, + {file = "polyleven-0.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:4db8b16aac237dbf644a0e4323c3ba0907dab6adecd2a345bf2fa92301d7fb2d"}, + {file = "polyleven-0.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45cea2885c61bda9711244a51aed068f9a55f1d776d4caad6c574a3f401945ae"}, + {file = "polyleven-0.9.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62b039e9dc8fa53ad740de02d168a7e9d0edce3734b2927f40fe851b328b766f"}, + {file = "polyleven-0.9.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0a0c1ecd2dc356fd94edc80e18a30ad28e93ccc840127e765b83ad60426b2d5"}, + {file = "polyleven-0.9.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:20576da0c8000bd1c4a07cee43db9169b7d094f5dcc03b20775506d07c56f4fb"}, + {file = "polyleven-0.9.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ba356ce9e7e7e8ddf4eff17eb39df5b822cb8899450c6d289a22249b78c9a5f4"}, + {file = "polyleven-0.9.0-cp313-cp313-win32.whl", hash = "sha256:244d759986486252121061d727a642d3505cbdd9e6616467b42935e662a9fa61"}, + {file = "polyleven-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f671df664924b3ec14195be7bf778d5f71811989e59a3f9547f8066cefc596f"}, + {file = "polyleven-0.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7309296f1f91e7aa7d292e5b9aa0da53f2ce7997cfda8535155424a791fe73c8"}, + {file = "polyleven-0.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50c71e238153acdf010c7fe6f18835dd6d7ca37a7e7cca08d51c2234e2227019"}, + {file = "polyleven-0.9.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ecf0a858b7694acea0f7459f8699f8b1f62ee99d88529b01f3a1597aa4c53978"}, + {file = "polyleven-0.9.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:c903c9b70a089c5f2b5990ce3a09ac1ce39d0b1ea93ec8c9e1eb217ddea779c6"}, + {file = "polyleven-0.9.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:e9608f5835f8fb3778aaad2b126aaea201cd9a6b210286533762c29cd3debcf2"}, + {file = "polyleven-0.9.0-cp38-cp38-win32.whl", hash = "sha256:aabd963fef557f6afe4306920cbd6c580aff572c8a96c5d6bf572fb9c4bdce46"}, + {file = "polyleven-0.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:e8c4c3c6515f4753fe69becb4686009bc5a5776752fd27a3d34d89f54f8c40e6"}, + {file = "polyleven-0.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c672c982108a48c7aebd7016aa8482b8ee96f01280a68cbee56293055aebdfc7"}, + {file = "polyleven-0.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a4f857c9f7fd99b7e41305e6cdb30d39592b1a6ca50fbc20edd175746e376ca"}, + {file = 
"polyleven-0.9.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26e06e1da0734c8d5a1625589d2bd213f9d40d0023370475c167dc773239ab78"}, + {file = "polyleven-0.9.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:9859199fefc85329b495cd0ce5b34df1a9acf6623d3dbaff5fcb688ade59fb88"}, + {file = "polyleven-0.9.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:58703ae7483b46a5e05d2d3f2cac2e345b96b57faaebfe09c5890eb5346daf31"}, + {file = "polyleven-0.9.0-cp39-cp39-win32.whl", hash = "sha256:92a0d2e4d6230f2ccc14d12d11cb496d5d5b81d975841bfed9dce6d11cf90826"}, + {file = "polyleven-0.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:1d651a6714caf4d144f8cb0bd6b1eb043a2ca80dd7c6d87b8f8020edc1729149"}, + {file = "polyleven-0.9.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:0a59f3cf5297e22aac73cf439e1e9cb0703af1adc853fb911637172db09bddec"}, + {file = "polyleven-0.9.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3c8581d8eae56d0e0e3cce33384b4365ef29a924f48edc6b3b5a694412c4b7d"}, + {file = "polyleven-0.9.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:603f0ea18dc0826f7078c14484c227dcdb61ca8e4485d0b67f2df317a3a01726"}, + {file = "polyleven-0.9.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8cf8ff07ea44947e9a34ab371a3b0fec4d2328957332185445cfdd1675539cb9"}, + {file = "polyleven-0.9.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:cf4fb8f5be74b9bf7e6f7c2014ee153dc4208af337b781cf3aafc5f51a647d80"}, + {file = "polyleven-0.9.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f21e6c050f6f0d259cf9c6367042ba6a69e553b8294143c83bb47f6481486f9c"}, + {file = "polyleven-0.9.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c74d8cba499541fe96e96a76cb8ac2bac7f3d7efeb8c2cec1bf1383c91790f4"}, + {file = "polyleven-0.9.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5260411e820a858728d32f161690a54bc2162644dba8f4e2b0dd72707d00ac20"}, + {file = "polyleven-0.9.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:81ae9a154c82d53ff67d6cd6b4ee96de3e449f2c8cccd49aaa62b50f6e57a4eb"}, + {file = "polyleven-0.9.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef398fe2759f84a6c088320742f09ecef5904e5c1f60668eed08f431221c5239"}, + {file = "polyleven-0.9.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3163f6c7ad192ee14ef760b1dd3143a3107c483a327dcfb5e6c94d4c8217fa4"}, + {file = "polyleven-0.9.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:87ef064bfe4a1b13414e440f56a716096375ec93cf1351bed9a84942c230c715"}, + {file = "polyleven-0.9.0.tar.gz", hash = "sha256:299a93766761b5e5fb4092388f3dc6401224fd436c05f11c4ee48b262587e8da"}, +] + [[package]] name = "pre-commit" version = "3.8.0" description = "A framework for managing and maintaining multi-language pre-commit hooks." 
optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pre_commit-3.8.0-py2.py3-none-any.whl", hash = "sha256:9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f"}, {file = "pre_commit-3.8.0.tar.gz", hash = "sha256:8bb6494d4a20423842e198980c9ecf9f96607a07ea29549e180eef9ae80fe7af"}, @@ -1338,7 +1420,6 @@ version = "6.32.0" description = "" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "protobuf-6.32.0-cp310-abi3-win32.whl", hash = "sha256:84f9e3c1ff6fb0308dbacb0950d8aa90694b0d0ee68e75719cb044b7078fe741"}, {file = "protobuf-6.32.0-cp310-abi3-win_amd64.whl", hash = "sha256:a8bdbb2f009cfc22a36d031f22a625a38b615b5e19e558a7b756b3279723e68e"}, @@ -1357,7 +1438,6 @@ version = "2.11.7" description = "Data validation using Python type hints" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b"}, {file = "pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db"}, @@ -1371,7 +1451,7 @@ typing-inspection = ">=0.4.0" [package.extras] email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] +timezone = ["tzdata"] [[package]] name = "pydantic-core" @@ -1379,7 +1459,6 @@ version = "2.33.2" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8"}, {file = "pydantic_core-2.33.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d"}, @@ -1491,7 +1570,6 @@ version = "2.19.2" description = "Pygments is a syntax highlighting package written in Python." 
optional = false python-versions = ">=3.8" -groups = ["dev", "docs"] files = [ {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, @@ -1506,7 +1584,6 @@ version = "8.4.1" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7"}, {file = "pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c"}, @@ -1530,7 +1607,6 @@ version = "1.1.0" description = "Pytest support for asyncio" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pytest_asyncio-1.1.0-py3-none-any.whl", hash = "sha256:5fe2d69607b0bd75c656d1211f969cadba035030156745ee09e7d71740e58ecf"}, {file = "pytest_asyncio-1.1.0.tar.gz", hash = "sha256:796aa822981e01b68c12e4827b8697108f7205020f24b5793b3c41555dab68ea"}, @@ -1551,7 +1627,6 @@ version = "1.1.3" description = "pytest-httpserver is a httpserver for pytest" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pytest_httpserver-1.1.3-py3-none-any.whl", hash = "sha256:5f84757810233e19e2bb5287f3826a71c97a3740abe3a363af9155c0f82fdbb9"}, {file = "pytest_httpserver-1.1.3.tar.gz", hash = "sha256:af819d6b533f84b4680b9416a5b3f67f1df3701f1da54924afd4d6e4ba5917ec"}, @@ -1566,7 +1641,6 @@ version = "2.4.0" description = "pytest plugin to abort hanging tests" optional = false python-versions = ">=3.7" -groups = ["dev"] files = [ {file = "pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2"}, {file = "pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a"}, @@ -1581,7 +1655,6 @@ version = "3.8.0" description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88"}, {file = "pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1"}, @@ -1602,7 +1675,6 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -1658,7 +1730,22 @@ files = [ {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] -markers = {main = "extra == \"langchain\""} + +[[package]] +name = "referencing" +version = "0.36.2" +description = "JSON Referencing + Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0"}, + {file = "referencing-0.36.2.tar.gz", 
hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +rpds-py = ">=0.7.0" +typing-extensions = {version = ">=4.4.0", markers = "python_version < \"3.13\""} [[package]] name = "regex" @@ -1666,7 +1753,6 @@ version = "2025.7.34" description = "Alternative regular expression module, to replace re." optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "regex-2025.7.34-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d856164d25e2b3b07b779bfed813eb4b6b6ce73c2fd818d46f47c1eb5cd79bd6"}, {file = "regex-2025.7.34-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d15a9da5fad793e35fb7be74eec450d968e05d2e294f3e0e77ab03fa7234a83"}, @@ -1763,7 +1849,6 @@ version = "2.32.5" description = "Python HTTP for Humans." optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6"}, {file = "requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf"}, @@ -1785,23 +1870,184 @@ version = "1.0.0" description = "A utility belt for advanced users of python-requests" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -groups = ["main", "dev"] files = [ {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, ] -markers = {main = "extra == \"langchain\""} [package.dependencies] requests = ">=2.0.1,<3.0.0" +[[package]] +name = "rpds-py" +version = "0.27.1" +description = "Python bindings to Rust's persistent data structures (rpds)" +optional = false +python-versions = ">=3.9" +files = [ + {file = "rpds_py-0.27.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:68afeec26d42ab3b47e541b272166a0b4400313946871cba3ed3a4fc0cab1cef"}, + {file = "rpds_py-0.27.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74e5b2f7bb6fa38b1b10546d27acbacf2a022a8b5543efb06cfebc72a59c85be"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9024de74731df54546fab0bfbcdb49fae19159ecaecfc8f37c18d2c7e2c0bd61"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:31d3ebadefcd73b73928ed0b2fd696f7fefda8629229f81929ac9c1854d0cffb"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2e7f8f169d775dd9092a1743768d771f1d1300453ddfe6325ae3ab5332b4657"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d905d16f77eb6ab2e324e09bfa277b4c8e5e6b8a78a3e7ff8f3cdf773b4c013"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50c946f048209e6362e22576baea09193809f87687a95a8db24e5fbdb307b93a"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:3deab27804d65cd8289eb814c2c0e807c4b9d9916c9225e363cb0cf875eb67c1"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8b61097f7488de4be8244c89915da8ed212832ccf1e7c7753a25a394bf9b1f10"}, + {file = "rpds_py-0.27.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8a3f29aba6e2d7d90528d3c792555a93497fe6538aa65eb675b44505be747808"}, + {file = 
"rpds_py-0.27.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dd6cd0485b7d347304067153a6dc1d73f7d4fd995a396ef32a24d24b8ac63ac8"}, + {file = "rpds_py-0.27.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6f4461bf931108c9fa226ffb0e257c1b18dc2d44cd72b125bec50ee0ab1248a9"}, + {file = "rpds_py-0.27.1-cp310-cp310-win32.whl", hash = "sha256:ee5422d7fb21f6a00c1901bf6559c49fee13a5159d0288320737bbf6585bd3e4"}, + {file = "rpds_py-0.27.1-cp310-cp310-win_amd64.whl", hash = "sha256:3e039aabf6d5f83c745d5f9a0a381d031e9ed871967c0a5c38d201aca41f3ba1"}, + {file = "rpds_py-0.27.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:be898f271f851f68b318872ce6ebebbc62f303b654e43bf72683dbdc25b7c881"}, + {file = "rpds_py-0.27.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:62ac3d4e3e07b58ee0ddecd71d6ce3b1637de2d373501412df395a0ec5f9beb5"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4708c5c0ceb2d034f9991623631d3d23cb16e65c83736ea020cdbe28d57c0a0e"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:abfa1171a9952d2e0002aba2ad3780820b00cc3d9c98c6630f2e93271501f66c"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b507d19f817ebaca79574b16eb2ae412e5c0835542c93fe9983f1e432aca195"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:168b025f8fd8d8d10957405f3fdcef3dc20f5982d398f90851f4abc58c566c52"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb56c6210ef77caa58e16e8c17d35c63fe3f5b60fd9ba9d424470c3400bcf9ed"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:d252f2d8ca0195faa707f8eb9368955760880b2b42a8ee16d382bf5dd807f89a"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6e5e54da1e74b91dbc7996b56640f79b195d5925c2b78efaa8c5d53e1d88edde"}, + {file = "rpds_py-0.27.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ffce0481cc6e95e5b3f0a47ee17ffbd234399e6d532f394c8dce320c3b089c21"}, + {file = "rpds_py-0.27.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a205fdfe55c90c2cd8e540ca9ceba65cbe6629b443bc05db1f590a3db8189ff9"}, + {file = "rpds_py-0.27.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:689fb5200a749db0415b092972e8eba85847c23885c8543a8b0f5c009b1a5948"}, + {file = "rpds_py-0.27.1-cp311-cp311-win32.whl", hash = "sha256:3182af66048c00a075010bc7f4860f33913528a4b6fc09094a6e7598e462fe39"}, + {file = "rpds_py-0.27.1-cp311-cp311-win_amd64.whl", hash = "sha256:b4938466c6b257b2f5c4ff98acd8128ec36b5059e5c8f8372d79316b1c36bb15"}, + {file = "rpds_py-0.27.1-cp311-cp311-win_arm64.whl", hash = "sha256:2f57af9b4d0793e53266ee4325535a31ba48e2f875da81a9177c9926dfa60746"}, + {file = "rpds_py-0.27.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:ae2775c1973e3c30316892737b91f9283f9908e3cc7625b9331271eaaed7dc90"}, + {file = "rpds_py-0.27.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2643400120f55c8a96f7c9d858f7be0c88d383cd4653ae2cf0d0c88f668073e5"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16323f674c089b0360674a4abd28d5042947d54ba620f72514d69be4ff64845e"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9a1f4814b65eacac94a00fc9a526e3fdafd78e439469644032032d0d63de4881"}, + {file = 
"rpds_py-0.27.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ba32c16b064267b22f1850a34051121d423b6f7338a12b9459550eb2096e7ec"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5c20f33fd10485b80f65e800bbe5f6785af510b9f4056c5a3c612ebc83ba6cb"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:466bfe65bd932da36ff279ddd92de56b042f2266d752719beb97b08526268ec5"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:41e532bbdcb57c92ba3be62c42e9f096431b4cf478da9bc3bc6ce5c38ab7ba7a"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f149826d742b406579466283769a8ea448eed82a789af0ed17b0cd5770433444"}, + {file = "rpds_py-0.27.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:80c60cfb5310677bd67cb1e85a1e8eb52e12529545441b43e6f14d90b878775a"}, + {file = "rpds_py-0.27.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:7ee6521b9baf06085f62ba9c7a3e5becffbc32480d2f1b351559c001c38ce4c1"}, + {file = "rpds_py-0.27.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a512c8263249a9d68cac08b05dd59d2b3f2061d99b322813cbcc14c3c7421998"}, + {file = "rpds_py-0.27.1-cp312-cp312-win32.whl", hash = "sha256:819064fa048ba01b6dadc5116f3ac48610435ac9a0058bbde98e569f9e785c39"}, + {file = "rpds_py-0.27.1-cp312-cp312-win_amd64.whl", hash = "sha256:d9199717881f13c32c4046a15f024971a3b78ad4ea029e8da6b86e5aa9cf4594"}, + {file = "rpds_py-0.27.1-cp312-cp312-win_arm64.whl", hash = "sha256:33aa65b97826a0e885ef6e278fbd934e98cdcfed80b63946025f01e2f5b29502"}, + {file = "rpds_py-0.27.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e4b9fcfbc021633863a37e92571d6f91851fa656f0180246e84cbd8b3f6b329b"}, + {file = "rpds_py-0.27.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1441811a96eadca93c517d08df75de45e5ffe68aa3089924f963c782c4b898cf"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55266dafa22e672f5a4f65019015f90336ed31c6383bd53f5e7826d21a0e0b83"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d78827d7ac08627ea2c8e02c9e5b41180ea5ea1f747e9db0915e3adf36b62dcf"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae92443798a40a92dc5f0b01d8a7c93adde0c4dc965310a29ae7c64d72b9fad2"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c46c9dd2403b66a2a3b9720ec4b74d4ab49d4fabf9f03dfdce2d42af913fe8d0"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2efe4eb1d01b7f5f1939f4ef30ecea6c6b3521eec451fb93191bf84b2a522418"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:15d3b4d83582d10c601f481eca29c3f138d44c92187d197aff663a269197c02d"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4ed2e16abbc982a169d30d1a420274a709949e2cbdef119fe2ec9d870b42f274"}, + {file = "rpds_py-0.27.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a75f305c9b013289121ec0f1181931975df78738cdf650093e6b86d74aa7d8dd"}, + {file = "rpds_py-0.27.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:67ce7620704745881a3d4b0ada80ab4d99df390838839921f99e63c474f82cf2"}, + {file = "rpds_py-0.27.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:9d992ac10eb86d9b6f369647b6a3f412fc0075cfd5d799530e84d335e440a002"}, + {file = "rpds_py-0.27.1-cp313-cp313-win32.whl", hash = "sha256:4f75e4bd8ab8db624e02c8e2fc4063021b58becdbe6df793a8111d9343aec1e3"}, + {file = "rpds_py-0.27.1-cp313-cp313-win_amd64.whl", hash = "sha256:f9025faafc62ed0b75a53e541895ca272815bec18abe2249ff6501c8f2e12b83"}, + {file = "rpds_py-0.27.1-cp313-cp313-win_arm64.whl", hash = "sha256:ed10dc32829e7d222b7d3b93136d25a406ba9788f6a7ebf6809092da1f4d279d"}, + {file = "rpds_py-0.27.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:92022bbbad0d4426e616815b16bc4127f83c9a74940e1ccf3cfe0b387aba0228"}, + {file = "rpds_py-0.27.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:47162fdab9407ec3f160805ac3e154df042e577dd53341745fc7fb3f625e6d92"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb89bec23fddc489e5d78b550a7b773557c9ab58b7946154a10a6f7a214a48b2"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e48af21883ded2b3e9eb48cb7880ad8598b31ab752ff3be6457001d78f416723"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6f5b7bd8e219ed50299e58551a410b64daafb5017d54bbe822e003856f06a802"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08f1e20bccf73b08d12d804d6e1c22ca5530e71659e6673bce31a6bb71c1e73f"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dc5dceeaefcc96dc192e3a80bbe1d6c410c469e97bdd47494a7d930987f18b2"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:d76f9cc8665acdc0c9177043746775aa7babbf479b5520b78ae4002d889f5c21"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:134fae0e36022edad8290a6661edf40c023562964efea0cc0ec7f5d392d2aaef"}, + {file = "rpds_py-0.27.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:eb11a4f1b2b63337cfd3b4d110af778a59aae51c81d195768e353d8b52f88081"}, + {file = "rpds_py-0.27.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:13e608ac9f50a0ed4faec0e90ece76ae33b34c0e8656e3dceb9a7db994c692cd"}, + {file = "rpds_py-0.27.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dd2135527aa40f061350c3f8f89da2644de26cd73e4de458e79606384f4f68e7"}, + {file = "rpds_py-0.27.1-cp313-cp313t-win32.whl", hash = "sha256:3020724ade63fe320a972e2ffd93b5623227e684315adce194941167fee02688"}, + {file = "rpds_py-0.27.1-cp313-cp313t-win_amd64.whl", hash = "sha256:8ee50c3e41739886606388ba3ab3ee2aae9f35fb23f833091833255a31740797"}, + {file = "rpds_py-0.27.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:acb9aafccaae278f449d9c713b64a9e68662e7799dbd5859e2c6b3c67b56d334"}, + {file = "rpds_py-0.27.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:b7fb801aa7f845ddf601c49630deeeccde7ce10065561d92729bfe81bd21fb33"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe0dd05afb46597b9a2e11c351e5e4283c741237e7f617ffb3252780cca9336a"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b6dfb0e058adb12d8b1d1b25f686e94ffa65d9995a5157afe99743bf7369d62b"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ed090ccd235f6fa8bb5861684567f0a83e04f52dfc2e5c05f2e4b1309fcf85e7"}, + {file = 
"rpds_py-0.27.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bf876e79763eecf3e7356f157540d6a093cef395b65514f17a356f62af6cc136"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:12ed005216a51b1d6e2b02a7bd31885fe317e45897de81d86dcce7d74618ffff"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:ee4308f409a40e50593c7e3bb8cbe0b4d4c66d1674a316324f0c2f5383b486f9"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0b08d152555acf1f455154d498ca855618c1378ec810646fcd7c76416ac6dc60"}, + {file = "rpds_py-0.27.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:dce51c828941973a5684d458214d3a36fcd28da3e1875d659388f4f9f12cc33e"}, + {file = "rpds_py-0.27.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:c1476d6f29eb81aa4151c9a31219b03f1f798dc43d8af1250a870735516a1212"}, + {file = "rpds_py-0.27.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:3ce0cac322b0d69b63c9cdb895ee1b65805ec9ffad37639f291dd79467bee675"}, + {file = "rpds_py-0.27.1-cp314-cp314-win32.whl", hash = "sha256:dfbfac137d2a3d0725758cd141f878bf4329ba25e34979797c89474a89a8a3a3"}, + {file = "rpds_py-0.27.1-cp314-cp314-win_amd64.whl", hash = "sha256:a6e57b0abfe7cc513450fcf529eb486b6e4d3f8aee83e92eb5f1ef848218d456"}, + {file = "rpds_py-0.27.1-cp314-cp314-win_arm64.whl", hash = "sha256:faf8d146f3d476abfee026c4ae3bdd9ca14236ae4e4c310cbd1cf75ba33d24a3"}, + {file = "rpds_py-0.27.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:ba81d2b56b6d4911ce735aad0a1d4495e808b8ee4dc58715998741a26874e7c2"}, + {file = "rpds_py-0.27.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:84f7d509870098de0e864cad0102711c1e24e9b1a50ee713b65928adb22269e4"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e960fc78fecd1100539f14132425e1d5fe44ecb9239f8f27f079962021523e"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:62f85b665cedab1a503747617393573995dac4600ff51869d69ad2f39eb5e817"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fed467af29776f6556250c9ed85ea5a4dd121ab56a5f8b206e3e7a4c551e48ec"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2729615f9d430af0ae6b36cf042cb55c0936408d543fb691e1a9e36648fd35a"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b207d881a9aef7ba753d69c123a35d96ca7cb808056998f6b9e8747321f03b8"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:639fd5efec029f99b79ae47e5d7e00ad8a773da899b6309f6786ecaf22948c48"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fecc80cb2a90e28af8a9b366edacf33d7a91cbfe4c2c4544ea1246e949cfebeb"}, + {file = "rpds_py-0.27.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:42a89282d711711d0a62d6f57d81aa43a1368686c45bc1c46b7f079d55692734"}, + {file = "rpds_py-0.27.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:cf9931f14223de59551ab9d38ed18d92f14f055a5f78c1d8ad6493f735021bbb"}, + {file = "rpds_py-0.27.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f39f58a27cc6e59f432b568ed8429c7e1641324fbe38131de852cd77b2d534b0"}, + {file = "rpds_py-0.27.1-cp314-cp314t-win32.whl", hash = "sha256:d5fa0ee122dc09e23607a28e6d7b150da16c662e66409bbe85230e4c85bb528a"}, + {file = 
"rpds_py-0.27.1-cp314-cp314t-win_amd64.whl", hash = "sha256:6567d2bb951e21232c2f660c24cf3470bb96de56cdcb3f071a83feeaff8a2772"}, + {file = "rpds_py-0.27.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c918c65ec2e42c2a78d19f18c553d77319119bf43aa9e2edf7fb78d624355527"}, + {file = "rpds_py-0.27.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1fea2b1a922c47c51fd07d656324531adc787e415c8b116530a1d29c0516c62d"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbf94c58e8e0cd6b6f38d8de67acae41b3a515c26169366ab58bdca4a6883bb8"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c2a8fed130ce946d5c585eddc7c8eeef0051f58ac80a8ee43bd17835c144c2cc"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:037a2361db72ee98d829bc2c5b7cc55598ae0a5e0ec1823a56ea99374cfd73c1"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5281ed1cc1d49882f9997981c88df1a22e140ab41df19071222f7e5fc4e72125"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fd50659a069c15eef8aa3d64bbef0d69fd27bb4a50c9ab4f17f83a16cbf8905"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_31_riscv64.whl", hash = "sha256:c4b676c4ae3921649a15d28ed10025548e9b561ded473aa413af749503c6737e"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:079bc583a26db831a985c5257797b2b5d3affb0386e7ff886256762f82113b5e"}, + {file = "rpds_py-0.27.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4e44099bd522cba71a2c6b97f68e19f40e7d85399de899d66cdb67b32d7cb786"}, + {file = "rpds_py-0.27.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e202e6d4188e53c6661af813b46c37ca2c45e497fc558bacc1a7630ec2695aec"}, + {file = "rpds_py-0.27.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:f41f814b8eaa48768d1bb551591f6ba45f87ac76899453e8ccd41dba1289b04b"}, + {file = "rpds_py-0.27.1-cp39-cp39-win32.whl", hash = "sha256:9e71f5a087ead99563c11fdaceee83ee982fd39cf67601f4fd66cb386336ee52"}, + {file = "rpds_py-0.27.1-cp39-cp39-win_amd64.whl", hash = "sha256:71108900c9c3c8590697244b9519017a400d9ba26a36c48381b3f64743a44aab"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7ba22cb9693df986033b91ae1d7a979bc399237d45fccf875b76f62bb9e52ddf"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5b640501be9288c77738b5492b3fd3abc4ba95c50c2e41273c8a1459f08298d3"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb08b65b93e0c6dd70aac7f7890a9c0938d5ec71d5cb32d45cf844fb8ae47636"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d7ff07d696a7a38152ebdb8212ca9e5baab56656749f3d6004b34ab726b550b8"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fb7c72262deae25366e3b6c0c0ba46007967aea15d1eea746e44ddba8ec58dcc"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b002cab05d6339716b03a4a3a2ce26737f6231d7b523f339fa061d53368c9d8"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23f6b69d1c26c4704fec01311963a41d7de3ee0570a84ebde4d544e5a1859ffc"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_31_riscv64.whl", hash = 
"sha256:530064db9146b247351f2a0250b8f00b289accea4596a033e94be2389977de71"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7b90b0496570bd6b0321724a330d8b545827c4df2034b6ddfc5f5275f55da2ad"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:879b0e14a2da6a1102a3fc8af580fc1ead37e6d6692a781bd8c83da37429b5ab"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:0d807710df3b5faa66c731afa162ea29717ab3be17bdc15f90f2d9f183da4059"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:3adc388fc3afb6540aec081fa59e6e0d3908722771aa1e37ffe22b220a436f0b"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c796c0c1cc68cb08b0284db4229f5af76168172670c74908fdbd4b7d7f515819"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cdfe4bb2f9fe7458b7453ad3c33e726d6d1c7c0a72960bcc23800d77384e42df"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:8fabb8fd848a5f75a2324e4a84501ee3a5e3c78d8603f83475441866e60b94a3"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eda8719d598f2f7f3e0f885cba8646644b55a187762bec091fa14a2b819746a9"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c64d07e95606ec402a0a1c511fe003873fa6af630bda59bac77fac8b4318ebc"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93a2ed40de81bcff59aabebb626562d48332f3d028ca2036f1d23cbb52750be4"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:387ce8c44ae94e0ec50532d9cb0edce17311024c9794eb196b90e1058aadeb66"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aaf94f812c95b5e60ebaf8bfb1898a7d7cb9c1af5744d4a67fa47796e0465d4e"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:4848ca84d6ded9b58e474dfdbad4b8bfb450344c0551ddc8d958bf4b36aa837c"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2bde09cbcf2248b73c7c323be49b280180ff39fadcfe04e7b6f54a678d02a7cf"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:94c44ee01fd21c9058f124d2d4f0c9dc7634bec93cd4b38eefc385dabe71acbf"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:df8b74962e35c9249425d90144e721eed198e6555a0e22a563d29fe4486b51f6"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:dc23e6820e3b40847e2f4a7726462ba0cf53089512abe9ee16318c366494c17a"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:aa8933159edc50be265ed22b401125c9eebff3171f570258854dbce3ecd55475"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a50431bf02583e21bf273c71b89d710e7a710ad5e39c725b14e685610555926f"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78af06ddc7fe5cc0e967085a9115accee665fb912c22a3f54bad70cc65b05fe6"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:70d0738ef8fee13c003b100c2fbd667ec4f133468109b3472d249231108283a3"}, + {file = 
"rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2f6fd8a1cea5bbe599b6e78a6e5ee08db434fc8ffea51ff201c8765679698b3"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8177002868d1426305bb5de1e138161c2ec9eb2d939be38291d7c431c4712df8"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:008b839781d6c9bf3b6a8984d1d8e56f0ec46dc56df61fd669c49b58ae800400"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:a55b9132bb1ade6c734ddd2759c8dc132aa63687d259e725221f106b83a0e485"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a46fdec0083a26415f11d5f236b79fa1291c32aaa4a17684d82f7017a1f818b1"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:8a63b640a7845f2bdd232eb0d0a4a2dd939bcdd6c57e6bb134526487f3160ec5"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:7e32721e5d4922deaaf963469d795d5bde6093207c52fec719bd22e5d1bedbc4"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:2c426b99a068601b5f4623573df7a7c3d72e87533a2dd2253353a03e7502566c"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:4fc9b7fe29478824361ead6e14e4f5aed570d477e06088826537e202d25fe859"}, + {file = "rpds_py-0.27.1.tar.gz", hash = "sha256:26a1c73171d10b7acccbded82bf6a586ab8203601e565badc74bbbf8bc5a10f8"}, +] + [[package]] name = "ruff" version = "0.12.11" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" -groups = ["dev"] files = [ {file = "ruff-0.12.11-py3-none-linux_armv6l.whl", hash = "sha256:93fce71e1cac3a8bf9200e63a38ac5c078f3b6baebffb74ba5274fb2ab276065"}, {file = "ruff-0.12.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b8e33ac7b28c772440afa80cebb972ffd823621ded90404f29e5ab6d1e2d4b93"}, @@ -1830,7 +2076,6 @@ version = "1.3.1" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -1842,8 +2087,6 @@ version = "2.0.43" description = "Database Abstraction Library" optional = true python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"langchain\"" files = [ {file = "SQLAlchemy-2.0.43-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:21ba7a08a4253c5825d1db389d4299f64a100ef9800e4624c8bf70d8f136e6ed"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11b9503fa6f8721bef9b8567730f664c5a5153d25e247aadc69247c4bc605227"}, @@ -1939,12 +2182,10 @@ version = "9.1.2" description = "Retry code until it succeeds" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138"}, {file = "tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb"}, ] -markers = {main = "extra == \"langchain\""} [package.extras] doc = ["reno", "sphinx"] @@ -1956,7 +2197,6 @@ version = "0.11.0" description = "tiktoken is a fast BPE 
tokeniser for use with OpenAI's models" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "tiktoken-0.11.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:8a9b517d6331d7103f8bef29ef93b3cca95fa766e293147fe7bacddf310d5917"}, {file = "tiktoken-0.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b4ddb1849e6bf0afa6cc1c5d809fb980ca240a5fffe585a04e119519758788c0"}, @@ -2004,8 +2244,6 @@ version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" -groups = ["dev"] -markers = "python_version < \"3.11\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, @@ -2047,12 +2285,10 @@ version = "4.67.1" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] files = [ {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, ] -markers = {main = "extra == \"openai\""} [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} @@ -2070,7 +2306,6 @@ version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, @@ -2082,7 +2317,6 @@ version = "0.4.1" description = "Runtime typing introspection tools" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51"}, {file = "typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28"}, @@ -2097,14 +2331,13 @@ version = "2.5.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"}, {file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"}, ] [package.extras] -brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] @@ -2115,7 +2348,6 @@ version = "20.34.0" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "virtualenv-20.34.0-py3-none-any.whl", hash = "sha256:341f5afa7eee943e4984a9207c025feedd768baff6753cd660c857ceb3e36026"}, {file = "virtualenv-20.34.0.tar.gz", hash = "sha256:44815b2c9dee7ed86e387b842a84f20b93f7f417f95886ca1996a72a4138eb1a"}, @@ -2129,7 +2361,7 @@ typing-extensions = {version = ">=4.13.2", markers = "python_version < \"3.11\"" [package.extras] docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] -test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"GraalVM\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] [[package]] name = "werkzeug" @@ -2137,7 +2369,6 @@ version = "3.1.3" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e"}, {file = "werkzeug-3.1.3.tar.gz", hash = "sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746"}, @@ -2155,7 +2386,6 @@ version = "1.17.3" description = "Module for decorators, wrappers and monkey patching." 
optional = false python-versions = ">=3.8" -groups = ["main"] files = [ {file = "wrapt-1.17.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88bbae4d40d5a46142e70d58bf664a89b6b4befaea7b2ecc14e03cedb8e06c04"}, {file = "wrapt-1.17.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6b13af258d6a9ad602d57d889f83b9d5543acd471eee12eb51f5b01f8eb1bc2"}, @@ -2246,7 +2476,6 @@ version = "3.5.0" description = "Python binding for xxHash" optional = false python-versions = ">=3.7" -groups = ["dev"] files = [ {file = "xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212"}, {file = "xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520"}, @@ -2379,14 +2608,13 @@ version = "3.23.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e"}, {file = "zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166"}, ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] @@ -2399,7 +2627,6 @@ version = "0.24.0" description = "Zstandard bindings for Python" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "zstandard-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:af1394c2c5febc44e0bbf0fc6428263fa928b50d1b1982ce1d870dc793a8e5f4"}, {file = "zstandard-0.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5e941654cef13a1d53634ec30933722eda11f44f99e1d0bc62bbce3387580d50"}, @@ -2501,16 +2728,15 @@ files = [ {file = "zstandard-0.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:8ecd3b1f7a601f79e0cd20c26057d770219c0dc2f572ea07390248da2def79a4"}, {file = "zstandard-0.24.0.tar.gz", hash = "sha256:fe3198b81c00032326342d973e526803f183f97aa9e9a98e3f897ebafe21178f"}, ] -markers = {main = "extra == \"langchain\""} [package.extras] -cffi = ["cffi (>=1.17) ; python_version >= \"3.13\" and platform_python_implementation != \"PyPy\""] +cffi = ["cffi (>=1.17)"] [extras] langchain = ["langchain"] openai = ["openai"] [metadata] -lock-version = "2.1" +lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "6fe7fed47d629061be2cfcd2a2ea4c83201e5de130faf5f664d68845c2fea22f" +content-hash = "83ae81e7b9fd90ae8000dc0ac491ff766b899b166a5fc895043d0555267e288c" diff --git a/pyproject.toml b/pyproject.toml index 37ff24c6a..70ab88454 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ ruff = ">=0.1.8,<0.13.0" mypy = "^1.0.0" langchain-openai = ">=0.0.5,<0.4" langgraph = ">=0.2.62,<0.7.0" +autoevals = "^0.0.130" [tool.poetry.group.docs.dependencies] pdoc = "^15.0.4" From 52f7d8038908db80c6e9dd205a34c64950ba838c Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Fri, 12 Sep 2025 14:32:04 +0200 Subject: [PATCH 12/25] push --- langfuse/_client/client.py | 2 + langfuse/_client/experiments.py | 71 +++++++++++++++++++++++++-------- langfuse/experiment.py | 2 + 3 files changed, 58 insertions(+), 17 deletions(-) diff --git 
a/langfuse/_client/client.py b/langfuse/_client/client.py index 27958b967..45bc773e8 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2726,6 +2726,8 @@ async def process_item(item: ExperimentItem) -> dict: self.flush() return { + "name": name, + "description": description, "item_results": valid_results, "run_evaluations": run_evaluations, "dataset_run_id": dataset_run_id, diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index 2e004d686..249c71d2d 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -189,6 +189,8 @@ class ExperimentResult(TypedDict): dataset_run_url: URL to view the dataset run in Langfuse UI """ + name: str + description: Optional[str] item_results: List[ExperimentItemResult] run_evaluations: List[Evaluation] dataset_run_id: Optional[str] @@ -578,27 +580,62 @@ def performance_distribution(*, item_results, **kwargs): ... -def format_experiment_results( - item_results: List[ExperimentItemResult], - run_evaluations: List[Evaluation], - experiment_name: str, - experiment_description: Optional[str] = None, - dataset_run_url: Optional[str] = None, +def format_experiment_result( + experiment_result: ExperimentResult, + *, include_item_results: bool = False, ) -> str: - """Format experiment results for display. + """Format an experiment result for human-readable display. + + Takes an ExperimentResult object and converts it into a nicely formatted + string suitable for console output or logging. The output includes experiment + overview, aggregate statistics, and optionally individual item details. Args: - item_results: Results from processing each item - run_evaluations: Results from run-level evaluators - experiment_name: Name of the experiment - experiment_description: Optional description of the experiment - dataset_run_url: Optional URL to dataset run in Langfuse UI - include_item_results: Whether to include individual item details + experiment_result: Complete experiment result containing name, description, + item results, run evaluations, and dataset run information. + include_item_results: Whether to include detailed results for each individual + item in the output. When False (default), only shows aggregate statistics. + Set to True to see input/output/scores for every processed item. Returns: - Formatted string representation of the results + A formatted multi-line string containing: + - Experiment name and description + - Number of items processed + - List of evaluation metrics used + - Average scores across all items + - Run-level evaluation results + - Dataset run URL (if available) + - Individual item details (if include_item_results=True) + + Examples: + Basic usage with aggregate results only: + ```python + result = langfuse.run_experiment(...) + print(format_experiment_result(result)) + ``` + + Detailed output including individual items: + ```python + result = langfuse.run_experiment(...) + detailed_report = format_experiment_result( + result, + include_item_results=True + ) + print(detailed_report) + ``` + + Save formatted results to file: + ```python + result = dataset.run_experiment(...) + with open("experiment_report.txt", "w") as f: + f.write(format_experiment_result(result, include_item_results=True)) + ``` """ + item_results = experiment_result["item_results"] + run_evaluations = experiment_result["run_evaluations"] + dataset_run_url = experiment_result["dataset_run_url"] + if not item_results: return "No experiment results to display." 
@@ -651,9 +688,9 @@ def format_experiment_results( # Experiment Overview output += f"\n{'─' * 50}\n" - output += f"📊 {experiment_name}" - if experiment_description: - output += f" - {experiment_description}" + output += f"📊 {experiment_result['name']}" + if experiment_result["description"]: + output += f" - {experiment_result['description']}" output += f"\n{len(item_results)} items" diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 2d54255e2..8bc953e82 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -9,6 +9,7 @@ RunEvaluatorFunction, TaskFunction, create_evaluator_from_autoevals, + format_experiment_result, ) __all__ = [ @@ -22,4 +23,5 @@ "EvaluatorFunction", "RunEvaluatorFunction", "create_evaluator_from_autoevals", + "format_experiment_result", ] From 7c583fe7cac570b1eda3378ae9bd3b1f05ec3c02 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Mon, 15 Sep 2025 13:52:49 +0200 Subject: [PATCH 13/25] push --- langfuse/_client/client.py | 22 +- langfuse/_client/datasets.py | 2 +- langfuse/_client/experiments.py | 818 ------------------------------- langfuse/experiment.py | 841 +++++++++++++++++++++++++++++++- tests/test_core_sdk.py | 2 +- 5 files changed, 829 insertions(+), 856 deletions(-) delete mode 100644 langfuse/_client/experiments.py diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 45bc773e8..514e00084 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -56,17 +56,6 @@ LANGFUSE_TRACING_ENABLED, LANGFUSE_TRACING_ENVIRONMENT, ) -from langfuse._client.experiments import ( - EvaluatorFunction, - ExperimentData, - ExperimentItem, - ExperimentItemResult, - ExperimentResult, - RunEvaluatorFunction, - TaskFunction, - _run_evaluator, - _run_task, -) from langfuse._client.resource_manager import LangfuseResourceManager from langfuse._client.span import ( LangfuseAgent, @@ -92,6 +81,17 @@ Prompt_Chat, Prompt_Text, ) +from langfuse.experiment import ( + EvaluatorFunction, + ExperimentData, + ExperimentItem, + ExperimentItemResult, + ExperimentResult, + RunEvaluatorFunction, + TaskFunction, + _run_evaluator, + _run_task, +) from langfuse.logger import langfuse_logger from langfuse.media import LangfuseMedia from langfuse.model import ( diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index 023b7f947..f62c8b0f1 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -4,7 +4,7 @@ from opentelemetry.util._decorator import _agnosticcontextmanager -from langfuse._client.experiments import ( +from langfuse.experiment import ( EvaluatorFunction, RunEvaluatorFunction, TaskFunction, diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py deleted file mode 100644 index 249c71d2d..000000000 --- a/langfuse/_client/experiments.py +++ /dev/null @@ -1,818 +0,0 @@ -"""Langfuse experiment functionality for running and evaluating tasks on datasets. - -This module provides the core experiment functionality for the Langfuse Python SDK, -allowing users to run experiments on datasets with automatic tracing, evaluation, -and result formatting. -""" - -import asyncio -import logging -from typing import ( - TYPE_CHECKING, - Any, - Awaitable, - Dict, - List, - Optional, - Protocol, - TypedDict, - Union, -) - -if TYPE_CHECKING: - from langfuse._client.datasets import DatasetItemClient - - -class LocalExperimentItem(TypedDict, total=False): - """Structure for local experiment data items (not from Langfuse datasets).
- - This TypedDict defines the structure for experiment items when using local data - rather than Langfuse-hosted datasets. All fields are optional to provide - flexibility in data structure. - - Attributes: - input: The input data to pass to the task function. Can be any type that - your task function can process (string, dict, list, etc.). This is - typically the prompt, question, or data that your task will operate on. - expected_output: Optional expected/ground truth output for evaluation purposes. - Used by evaluators to assess correctness or quality. Can be None if - no ground truth is available. - metadata: Optional metadata dictionary containing additional context about - this specific item. Can include information like difficulty level, - category, source, or any other relevant attributes that evaluators - might use for context-aware evaluation. - - Examples: - Simple text processing item: - ```python - item: LocalExperimentItem = { - "input": "Summarize this article: ...", - "expected_output": "Expected summary...", - "metadata": {"difficulty": "medium", "category": "news"} - } - ``` - - Classification item: - ```python - item: LocalExperimentItem = { - "input": {"text": "This movie is great!", "context": "movie review"}, - "expected_output": "positive", - "metadata": {"dataset_source": "imdb", "confidence": 0.95} - } - ``` - - Minimal item with only input: - ```python - item: LocalExperimentItem = { - "input": "What is the capital of France?" - } - ``` - """ - - input: Any - expected_output: Any - metadata: Optional[Dict[str, Any]] - - -ExperimentItem = Union[LocalExperimentItem, "DatasetItemClient"] -"""Type alias for items that can be processed in experiments. - -Can be either: -- LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys -- DatasetItemClient: Items from Langfuse datasets with .input, .expected_output, .metadata attributes -""" - -ExperimentData = Union[List[LocalExperimentItem], List["DatasetItemClient"]] -"""Type alias for experiment datasets. - -Represents the collection of items to process in an experiment. Can be either: -- List[LocalExperimentItem]: Local data items as dictionaries -- List[DatasetItemClient]: Items from a Langfuse dataset (typically from dataset.items) -""" - - -class Evaluation(TypedDict, total=False): - """Structure for evaluation results returned by evaluator functions. - - This TypedDict defines the standardized format that all evaluator functions - must return. It provides a consistent structure for storing evaluation metrics - and their metadata across different types of evaluators. - - Attributes: - name: Unique identifier for the evaluation metric. Should be descriptive - and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). - Used for aggregation and comparison across experiment runs. - value: The evaluation score or result. Can be: - - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42) - - String: For categorical results like "positive", "negative", "neutral" - - Boolean: For binary assessments like "passes_safety_check" - - None: When evaluation cannot be computed (missing data, API errors, etc.) - comment: Optional human-readable explanation of the evaluation result. - Useful for providing context, explaining scoring rationale, or noting - special conditions. Displayed in Langfuse UI for interpretability. - metadata: Optional structured metadata about the evaluation process. 
- Can include confidence scores, intermediate calculations, model versions, - or any other relevant technical details. - - Examples: - Quantitative accuracy evaluation: - ```python - accuracy_result: Evaluation = { - "name": "accuracy", - "value": 0.85, - "comment": "85% of responses were correct", - "metadata": {"total_items": 100, "correct_items": 85} - } - ``` - - Qualitative assessment: - ```python - sentiment_result: Evaluation = { - "name": "sentiment", - "value": "positive", - "comment": "Response expresses optimistic viewpoint", - "metadata": {"confidence": 0.92, "model": "sentiment-analyzer-v2"} - } - ``` - - Binary check: - ```python - safety_result: Evaluation = { - "name": "safety_check", - "value": True, - "comment": "Content passes all safety filters" - } - ``` - - Failed evaluation: - ```python - failed_result: Evaluation = { - "name": "external_api_score", - "value": None, - "comment": "External API unavailable", - "metadata": {"error": "timeout", "retry_count": 3} - } - ``` - """ - - name: str - value: Union[int, float, str, bool, None] - comment: Optional[str] - metadata: Optional[Dict[str, Any]] - - -class ExperimentItemResult(TypedDict): - """Result structure for individual experiment items. - - Args: - item: The original experiment item that was processed - output: The actual output produced by the task - evaluations: List of evaluation results for this item - trace_id: Langfuse trace ID for this item's execution - dataset_run_id: Dataset run ID if this item was part of a Langfuse dataset - """ - - item: ExperimentItem - output: Any - evaluations: List[Evaluation] - trace_id: Optional[str] - dataset_run_id: Optional[str] - - -class ExperimentResult(TypedDict): - """Complete result structure for experiment execution. - - Args: - item_results: Results from processing each individual data item - run_evaluations: Results from run-level evaluators - dataset_run_id: ID of the dataset run (if using Langfuse datasets) - dataset_run_url: URL to view the dataset run in Langfuse UI - """ - - name: str - description: Optional[str] - item_results: List[ExperimentItemResult] - run_evaluations: List[Evaluation] - dataset_run_id: Optional[str] - dataset_run_url: Optional[str] - - -class TaskFunction(Protocol): - """Protocol defining the interface for experiment task functions. - - Task functions are the core processing functions that operate on each item - in an experiment dataset. They receive an experiment item as input and - produce some output that will be evaluated. - - Task functions must: - - Accept 'item' as a keyword argument - - Return any type of output (will be passed to evaluators) - - Can be either synchronous or asynchronous - - Should handle their own errors gracefully (exceptions will be logged) - """ - - def __call__( - self, - *, - item: ExperimentItem, - **kwargs: Dict[str, Any], - ) -> Union[Any, Awaitable[Any]]: - """Execute the task on an experiment item. - - This method defines the core processing logic for each item in your experiment. - The implementation should focus on the specific task you want to evaluate, - such as text generation, classification, summarization, etc. - - Args: - item: The experiment item to process. Can be either: - - Dict with keys like 'input', 'expected_output', 'metadata' - - Langfuse DatasetItem object with .input, .expected_output attributes - **kwargs: Additional keyword arguments that may be passed by the framework - - Returns: - Any: The output of processing the item. 
This output will be: - - Stored in the experiment results - - Passed to all item-level evaluators for assessment - - Traced automatically in Langfuse for observability - - Can return either a direct value or an awaitable (async) result. - - Examples: - Simple synchronous task: - ```python - def my_task(*, item, **kwargs): - prompt = f"Summarize: {item['input']}" - return my_llm_client.generate(prompt) - ``` - - Async task with error handling: - ```python - async def my_async_task(*, item, **kwargs): - try: - response = await openai_client.chat.completions.create( - model="gpt-4", - messages=[{"role": "user", "content": item["input"]}] - ) - return response.choices[0].message.content - except Exception as e: - # Log error and return fallback - print(f"Task failed for item {item}: {e}") - return "Error: Could not process item" - ``` - - Task using dataset item attributes: - ```python - def classification_task(*, item, **kwargs): - # Works with both dict items and DatasetItem objects - text = item["input"] if isinstance(item, dict) else item.input - return classify_text(text) - ``` - """ - ... - - -class EvaluatorFunction(Protocol): - """Protocol defining the interface for item-level evaluator functions. - - Item-level evaluators assess the quality, correctness, or other properties - of individual task outputs. They receive the input, output, expected output, - and metadata for each item and return evaluation metrics. - - Evaluators should: - - Accept input, output, expected_output, and metadata as keyword arguments - - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields - - Be deterministic when possible for reproducible results - - Handle edge cases gracefully (missing expected output, malformed data, etc.) - - Can be either synchronous or asynchronous - """ - - def __call__( - self, - *, - input: Any, - output: Any, - expected_output: Any, - metadata: Optional[Dict[str, Any]], - **kwargs: Dict[str, Any], - ) -> Union[ - Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] - ]: - """Evaluate a task output for quality, correctness, or other metrics. - - This method should implement specific evaluation logic such as accuracy checking, - similarity measurement, toxicity detection, fluency assessment, etc. - - Args: - input: The original input that was passed to the task function. - This is typically the item['input'] or item.input value. - output: The output produced by the task function for this input. - This is the direct return value from your task function. - expected_output: The expected/ground truth output for comparison. - May be None if not available in the dataset. Evaluators should - handle this case appropriately. - metadata: Optional metadata from the experiment item that might - contain additional context for evaluation (categories, difficulty, etc.) 
- **kwargs: Additional keyword arguments that may be passed by the framework - - Returns: - Evaluation results in one of these formats: - - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."} - - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}] - - Awaitable returning either of the above (for async evaluators) - - Each Evaluation dict should contain: - - name (str): Unique identifier for this evaluation metric - - value (int|float|str|bool): The evaluation score or result - - comment (str, optional): Human-readable explanation of the result - - metadata (dict, optional): Additional structured data about the evaluation - - Examples: - Simple accuracy evaluator: - ```python - def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): - if expected_output is None: - return {"name": "accuracy", "value": None, "comment": "No expected output"} - - is_correct = output.strip().lower() == expected_output.strip().lower() - return { - "name": "accuracy", - "value": 1.0 if is_correct else 0.0, - "comment": "Exact match" if is_correct else "No match" - } - ``` - - Multi-metric evaluator: - ```python - def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): - results = [] - - # Length check - results.append({ - "name": "output_length", - "value": len(output), - "comment": f"Output contains {len(output)} characters" - }) - - # Sentiment analysis - sentiment_score = analyze_sentiment(output) - results.append({ - "name": "sentiment", - "value": sentiment_score, - "comment": f"Sentiment score: {sentiment_score:.2f}" - }) - - return results - ``` - - Async evaluator using external API: - ```python - async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs): - prompt = f"Rate the quality of this response on a scale of 1-10:\n" - prompt += f"Question: {input}\nResponse: {output}" - - response = await openai_client.chat.completions.create( - model="gpt-4", - messages=[{"role": "user", "content": prompt}] - ) - - try: - score = float(response.choices[0].message.content.strip()) - return { - "name": "llm_judge_quality", - "value": score, - "comment": f"LLM judge rated this {score}/10" - } - except ValueError: - return { - "name": "llm_judge_quality", - "value": None, - "comment": "Could not parse LLM judge score" - } - ``` - - Context-aware evaluator: - ```python - def context_evaluator(*, input, output, metadata=None, **kwargs): - # Use metadata for context-specific evaluation - difficulty = metadata.get("difficulty", "medium") if metadata else "medium" - - # Adjust expectations based on difficulty - min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty] - - meets_requirement = len(output) >= min_length - return { - "name": f"meets_{difficulty}_requirement", - "value": meets_requirement, - "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement" - } - ``` - """ - ... - - -class RunEvaluatorFunction(Protocol): - """Protocol defining the interface for run-level evaluator functions. - - Run-level evaluators assess aggregate properties of the entire experiment run, - computing metrics that span across all items rather than individual outputs. - They receive the complete results from all processed items and can compute - statistics like averages, distributions, correlations, or other aggregate metrics. 
- - Run evaluators should: - - Accept item_results as a keyword argument containing all item results - - Return Evaluation dict(s) with aggregate metrics - - Handle cases where some items may have failed processing - - Compute meaningful statistics across the dataset - - Can be either synchronous or asynchronous - """ - - def __call__( - self, - *, - item_results: List[ExperimentItemResult], - **kwargs: Dict[str, Any], - ) -> Union[ - Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] - ]: - """Evaluate the entire experiment run with aggregate metrics. - - This method should implement aggregate evaluation logic such as computing - averages, calculating distributions, finding correlations, detecting patterns - across items, or performing statistical analysis on the experiment results. - - Args: - item_results: List of results from all successfully processed experiment items. - Each item result contains: - - item: The original experiment item - - output: The task function's output for this item - - evaluations: List of item-level evaluation results - - trace_id: Langfuse trace ID for this execution - - dataset_run_id: Dataset run ID (if using Langfuse datasets) - - Note: This list only includes items that were successfully processed. - Failed items are excluded but logged separately. - **kwargs: Additional keyword arguments that may be passed by the framework - - Returns: - Evaluation results in one of these formats: - - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."} - - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}] - - Awaitable returning either of the above (for async evaluators) - - Each Evaluation dict should contain: - - name (str): Unique identifier for this run-level metric - - value (int|float|str|bool): The aggregate evaluation result - - comment (str, optional): Human-readable explanation of the metric - - metadata (dict, optional): Additional structured data about the evaluation - - Examples: - Average accuracy calculator: - ```python - def average_accuracy(*, item_results, **kwargs): - if not item_results: - return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"} - - accuracy_values = [] - for result in item_results: - for evaluation in result["evaluations"]: - if evaluation["name"] == "accuracy": - accuracy_values.append(evaluation["value"]) - - if not accuracy_values: - return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"} - - avg = sum(accuracy_values) / len(accuracy_values) - return { - "name": "avg_accuracy", - "value": avg, - "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}" - } - ``` - - Multiple aggregate metrics: - ```python - def statistical_summary(*, item_results, **kwargs): - if not item_results: - return [] - - results = [] - - # Calculate output length statistics - lengths = [len(str(result["output"])) for result in item_results] - results.extend([ - {"name": "avg_output_length", "value": sum(lengths) / len(lengths)}, - {"name": "min_output_length", "value": min(lengths)}, - {"name": "max_output_length", "value": max(lengths)} - ]) - - # Success rate - total_items = len(item_results) # Only successful items are included - results.append({ - "name": "processing_success_rate", - "value": 1.0, # All items in item_results succeeded - "comment": f"Successfully processed {total_items} items" - }) - - return results - ``` - - Async run evaluator with external analysis: - ```python - async def 
llm_batch_analysis(*, item_results, **kwargs): - # Prepare batch analysis prompt - outputs = [result["output"] for result in item_results] - prompt = f"Analyze these {len(outputs)} outputs for common themes:\n" - prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs)) - - response = await openai_client.chat.completions.create( - model="gpt-4", - messages=[{"role": "user", "content": prompt}] - ) - - return { - "name": "thematic_analysis", - "value": response.choices[0].message.content, - "comment": f"LLM analysis of {len(outputs)} outputs" - } - ``` - - Performance distribution analysis: - ```python - def performance_distribution(*, item_results, **kwargs): - # Extract all evaluation scores - all_scores = [] - score_by_metric = {} - - for result in item_results: - for evaluation in result["evaluations"]: - metric_name = evaluation["name"] - value = evaluation["value"] - - if isinstance(value, (int, float)): - all_scores.append(value) - if metric_name not in score_by_metric: - score_by_metric[metric_name] = [] - score_by_metric[metric_name].append(value) - - results = [] - - # Overall score distribution - if all_scores: - import statistics - results.append({ - "name": "score_std_dev", - "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0, - "comment": f"Standard deviation across all numeric scores" - }) - - # Per-metric statistics - for metric, scores in score_by_metric.items(): - if len(scores) > 1: - results.append({ - "name": f"{metric}_variance", - "value": statistics.variance(scores), - "comment": f"Variance in {metric} across {len(scores)} items" - }) - - return results - ``` - """ - ... - - -def format_experiment_result( - experiment_result: ExperimentResult, - *, - include_item_results: bool = False, -) -> str: - """Format an experiment result for human-readable display. - - Takes an ExperimentResult object and converts it into a nicely formatted - string suitable for console output or logging. The output includes experiment - overview, aggregate statistics, and optionally individual item details. - - Args: - experiment_result: Complete experiment result containing name, description, - item results, run evaluations, and dataset run information. - include_item_results: Whether to include detailed results for each individual - item in the output. When False (default), only shows aggregate statistics. - Set to True to see input/output/scores for every processed item. - - Returns: - A formatted multi-line string containing: - - Experiment name and description - - Number of items processed - - List of evaluation metrics used - - Average scores across all items - - Run-level evaluation results - - Dataset run URL (if available) - - Individual item details (if include_item_results=True) - - Examples: - Basic usage with aggregate results only: - ```python - result = langfuse.run_experiment(...) - print(format_experiment_result(result)) - ``` - - Detailed output including individual items: - ```python - result = langfuse.run_experiment(...) - detailed_report = format_experiment_result( - result, - include_item_results=True - ) - print(detailed_report) - ``` - - Save formatted results to file: - ```python - result = dataset.run_experiment(...) 
- with open("experiment_report.txt", "w") as f: - f.write(format_experiment_result(result, include_item_results=True)) - ``` - """ - item_results = experiment_result["item_results"] - run_evaluations = experiment_result["run_evaluations"] - dataset_run_url = experiment_result["dataset_run_url"] - - if not item_results: - return "No experiment results to display." - - output = "" - - # Individual results - if include_item_results: - for i, result in enumerate(item_results): - output += f"\n{i + 1}. Item {i + 1}:\n" - - # Input, expected, and actual - item_input = None - if isinstance(result["item"], dict): - item_input = result["item"].get("input") - elif hasattr(result["item"], "input"): - item_input = result["item"].input - - if item_input is not None: - output += f" Input: {_format_value(item_input)}\n" - - expected_output = None - if isinstance(result["item"], dict): - expected_output = result["item"].get("expected_output") - elif hasattr(result["item"], "expected_output"): - expected_output = result["item"].expected_output - - if expected_output is not None: - output += f" Expected: {_format_value(expected_output)}\n" - output += f" Actual: {_format_value(result['output'])}\n" - - # Scores - if result["evaluations"]: - output += " Scores:\n" - for evaluation in result["evaluations"]: - score = evaluation["value"] - if isinstance(score, (int, float)): - score = f"{score:.3f}" - output += f" โ€ข {evaluation['name']}: {score}" - if evaluation.get("comment"): - output += f"\n ๐Ÿ’ญ {evaluation['comment']}" - output += "\n" - - # Trace link - if result.get("trace_id"): - # Note: We'd need the langfuse client to generate the actual URL - output += f"\n Trace ID: {result['trace_id']}\n" - else: - output += f"Individual Results: Hidden ({len(item_results)} items)\n" - output += "๐Ÿ’ก Set include_item_results=True to view them\n" - - # Experiment Overview - output += f"\n{'โ”€' * 50}\n" - output += f"๐Ÿ“Š {experiment_result['name']}" - if experiment_result["description"]: - output += f" - {experiment_result['description']}" - - output += f"\n{len(item_results)} items" - - # Get unique evaluation names - evaluation_names = set() - for result in item_results: - for evaluation in result["evaluations"]: - evaluation_names.add(evaluation["name"]) - - if evaluation_names: - output += "\nEvaluations:" - for eval_name in evaluation_names: - output += f"\n โ€ข {eval_name}" - output += "\n" - - # Average scores - if evaluation_names: - output += "\nAverage Scores:" - for eval_name in evaluation_names: - scores = [] - for result in item_results: - for evaluation in result["evaluations"]: - if evaluation["name"] == eval_name and isinstance( - evaluation["value"], (int, float) - ): - scores.append(evaluation["value"]) - - if scores: - avg = sum(scores) / len(scores) - output += f"\n โ€ข {eval_name}: {avg:.3f}" - output += "\n" - - # Run evaluations - if run_evaluations: - output += "\nRun Evaluations:" - for run_eval in run_evaluations: - score = run_eval["value"] - if isinstance(score, (int, float)): - score = f"{score:.3f}" - output += f"\n โ€ข {run_eval['name']}: {score}" - if run_eval.get("comment"): - output += f"\n ๐Ÿ’ญ {run_eval['comment']}" - output += "\n" - - if dataset_run_url: - output += f"\n๐Ÿ”— Dataset Run:\n {dataset_run_url}" - - return output - - -def _format_value(value: Any) -> str: - """Format a value for display.""" - if isinstance(value, str): - return value[:50] + "..." 
if len(value) > 50 else value - return str(value) - - -async def _run_evaluator( - evaluator: Union[EvaluatorFunction, RunEvaluatorFunction], **kwargs: Any -) -> List[Evaluation]: - """Run an evaluator function and normalize the result.""" - try: - result = evaluator(**kwargs) - - # Handle async evaluators - if asyncio.iscoroutine(result): - result = await result - - # Normalize to list - if isinstance(result, dict): - return [result] - - elif isinstance(result, list): - return result - - else: - return [] - - except Exception as e: - evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator") - logging.getLogger("langfuse").error(f"Evaluator {evaluator_name} failed: {e}") - return [] - - -async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any: - """Run a task function and handle sync/async.""" - result = task(item=item) - - # Handle async tasks - if asyncio.iscoroutine(result): - result = await result - - return result - - -def create_evaluator_from_autoevals( - autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]] -) -> EvaluatorFunction: - """Create a Langfuse evaluator from an autoevals evaluator. - - Args: - autoevals_evaluator: An autoevals evaluator instance - **kwargs: Additional arguments passed to the evaluator - - Returns: - A Langfuse-compatible evaluator function - """ - - def langfuse_evaluator( - *, - input: Any, - output: Any, - expected_output: Any, - metadata: Optional[Dict[str, Any]], - **kwargs: Dict[str, Any], - ) -> Evaluation: - evaluation = autoevals_evaluator( - input=input, output=output, expected=expected_output, **kwargs - ) - - return Evaluation( - name=evaluation.name, value=evaluation.score, metadata=evaluation.metadata - ) - - return langfuse_evaluator diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 8bc953e82..249c71d2d 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -1,27 +1,818 @@ -from ._client.experiments import ( - Evaluation, - EvaluatorFunction, - ExperimentData, - ExperimentItem, - ExperimentItemResult, - ExperimentResult, - LocalExperimentItem, - RunEvaluatorFunction, - TaskFunction, - create_evaluator_from_autoevals, - format_experiment_result, +"""Langfuse experiment functionality for running and evaluating tasks on datasets. + +This module provides the core experiment functionality for the Langfuse Python SDK, +allowing users to run experiments on datasets with automatic tracing, evaluation, +and result formatting. +""" + +import asyncio +import logging +from typing import ( + TYPE_CHECKING, + Any, + Awaitable, + Dict, + List, + Optional, + Protocol, + TypedDict, + Union, ) -__all__ = [ - "LocalExperimentItem", - "ExperimentItem", - "ExperimentData", - "Evaluation", - "ExperimentItemResult", - "ExperimentResult", - "TaskFunction", - "EvaluatorFunction", - "RunEvaluatorFunction", - "create_evaluator_from_autoevals", - "format_experiment_result", -] +if TYPE_CHECKING: + from langfuse._client.datasets import DatasetItemClient + + +class LocalExperimentItem(TypedDict, total=False): + """Structure for local experiment data items (not from Langfuse datasets). + + This TypedDict defines the structure for experiment items when using local data + rather than Langfuse-hosted datasets. All fields are optional to provide + flexibility in data structure. + + Attributes: + input: The input data to pass to the task function. Can be any type that + your task function can process (string, dict, list, etc.). This is + typically the prompt, question, or data that your task will operate on. 
+ expected_output: Optional expected/ground truth output for evaluation purposes. + Used by evaluators to assess correctness or quality. Can be None if + no ground truth is available. + metadata: Optional metadata dictionary containing additional context about + this specific item. Can include information like difficulty level, + category, source, or any other relevant attributes that evaluators + might use for context-aware evaluation. + + Examples: + Simple text processing item: + ```python + item: LocalExperimentItem = { + "input": "Summarize this article: ...", + "expected_output": "Expected summary...", + "metadata": {"difficulty": "medium", "category": "news"} + } + ``` + + Classification item: + ```python + item: LocalExperimentItem = { + "input": {"text": "This movie is great!", "context": "movie review"}, + "expected_output": "positive", + "metadata": {"dataset_source": "imdb", "confidence": 0.95} + } + ``` + + Minimal item with only input: + ```python + item: LocalExperimentItem = { + "input": "What is the capital of France?" + } + ``` + """ + + input: Any + expected_output: Any + metadata: Optional[Dict[str, Any]] + + +ExperimentItem = Union[LocalExperimentItem, "DatasetItemClient"] +"""Type alias for items that can be processed in experiments. + +Can be either: +- LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys +- DatasetItemClient: Items from Langfuse datasets with .input, .expected_output, .metadata attributes +""" + +ExperimentData = Union[List[LocalExperimentItem], List["DatasetItemClient"]] +"""Type alias for experiment datasets. + +Represents the collection of items to process in an experiment. Can be either: +- List[LocalExperimentItem]: Local data items as dictionaries +- List[DatasetItemClient]: Items from a Langfuse dataset (typically from dataset.items) +""" + + +class Evaluation(TypedDict, total=False): + """Structure for evaluation results returned by evaluator functions. + + This TypedDict defines the standardized format that all evaluator functions + must return. It provides a consistent structure for storing evaluation metrics + and their metadata across different types of evaluators. + + Attributes: + name: Unique identifier for the evaluation metric. Should be descriptive + and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). + Used for aggregation and comparison across experiment runs. + value: The evaluation score or result. Can be: + - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42) + - String: For categorical results like "positive", "negative", "neutral" + - Boolean: For binary assessments like "passes_safety_check" + - None: When evaluation cannot be computed (missing data, API errors, etc.) + comment: Optional human-readable explanation of the evaluation result. + Useful for providing context, explaining scoring rationale, or noting + special conditions. Displayed in Langfuse UI for interpretability. + metadata: Optional structured metadata about the evaluation process. + Can include confidence scores, intermediate calculations, model versions, + or any other relevant technical details. 
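To make the expected return shape concrete before the examples below, here is a small self-contained sketch (the evaluator name and word-overlap heuristic are illustrative only, not part of the SDK) of an evaluator that returns either a single evaluation dict or a list of them, together with the same list-normalization that the module-level `_run_evaluator` helper applies later in this file:

```python
from typing import Any, Dict, List, Optional, Union

Evaluation = Dict[str, Any]  # stand-in for the Evaluation TypedDict defined in this module


def word_overlap_evaluator(
    *, input: Any, output: str, expected_output: Optional[str] = None, **kwargs: Any
) -> Union[Evaluation, List[Evaluation]]:
    """Illustrative evaluator: one metric without a reference, two with one."""
    length_eval: Evaluation = {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters",
    }
    if expected_output is None:
        return length_eval

    overlap = len(set(output.lower().split()) & set(expected_output.lower().split()))
    return [
        length_eval,
        {
            "name": "word_overlap",
            "value": overlap,
            "comment": f"{overlap} words shared with the expected output",
            "metadata": {"method": "set intersection"},
        },
    ]


def normalize(result: Union[Evaluation, List[Evaluation]]) -> List[Evaluation]:
    """Mirrors the single-dict vs. list normalization done by _run_evaluator."""
    if isinstance(result, dict):
        return [result]
    return list(result) if isinstance(result, list) else []


if __name__ == "__main__":
    for evaluation in normalize(
        word_overlap_evaluator(
            input="What is the capital of France?",
            output="The capital of France is Paris",
            expected_output="Paris",
        )
    ):
        print(evaluation["name"], evaluation["value"])
```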
+ + Examples: + Quantitative accuracy evaluation: + ```python + accuracy_result: Evaluation = { + "name": "accuracy", + "value": 0.85, + "comment": "85% of responses were correct", + "metadata": {"total_items": 100, "correct_items": 85} + } + ``` + + Qualitative assessment: + ```python + sentiment_result: Evaluation = { + "name": "sentiment", + "value": "positive", + "comment": "Response expresses optimistic viewpoint", + "metadata": {"confidence": 0.92, "model": "sentiment-analyzer-v2"} + } + ``` + + Binary check: + ```python + safety_result: Evaluation = { + "name": "safety_check", + "value": True, + "comment": "Content passes all safety filters" + } + ``` + + Failed evaluation: + ```python + failed_result: Evaluation = { + "name": "external_api_score", + "value": None, + "comment": "External API unavailable", + "metadata": {"error": "timeout", "retry_count": 3} + } + ``` + """ + + name: str + value: Union[int, float, str, bool, None] + comment: Optional[str] + metadata: Optional[Dict[str, Any]] + + +class ExperimentItemResult(TypedDict): + """Result structure for individual experiment items. + + Args: + item: The original experiment item that was processed + output: The actual output produced by the task + evaluations: List of evaluation results for this item + trace_id: Langfuse trace ID for this item's execution + dataset_run_id: Dataset run ID if this item was part of a Langfuse dataset + """ + + item: ExperimentItem + output: Any + evaluations: List[Evaluation] + trace_id: Optional[str] + dataset_run_id: Optional[str] + + +class ExperimentResult(TypedDict): + """Complete result structure for experiment execution. + + Args: + item_results: Results from processing each individual data item + run_evaluations: Results from run-level evaluators + dataset_run_id: ID of the dataset run (if using Langfuse datasets) + dataset_run_url: URL to view the dataset run in Langfuse UI + """ + + name: str + description: Optional[str] + item_results: List[ExperimentItemResult] + run_evaluations: List[Evaluation] + dataset_run_id: Optional[str] + dataset_run_url: Optional[str] + + +class TaskFunction(Protocol): + """Protocol defining the interface for experiment task functions. + + Task functions are the core processing functions that operate on each item + in an experiment dataset. They receive an experiment item as input and + produce some output that will be evaluated. + + Task functions must: + - Accept 'item' as a keyword argument + - Return any type of output (will be passed to evaluators) + - Can be either synchronous or asynchronous + - Should handle their own errors gracefully (exceptions will be logged) + """ + + def __call__( + self, + *, + item: ExperimentItem, + **kwargs: Dict[str, Any], + ) -> Union[Any, Awaitable[Any]]: + """Execute the task on an experiment item. + + This method defines the core processing logic for each item in your experiment. + The implementation should focus on the specific task you want to evaluate, + such as text generation, classification, summarization, etc. + + Args: + item: The experiment item to process. Can be either: + - Dict with keys like 'input', 'expected_output', 'metadata' + - Langfuse DatasetItem object with .input, .expected_output attributes + **kwargs: Additional keyword arguments that may be passed by the framework + + Returns: + Any: The output of processing the item. 
This output will be: + - Stored in the experiment results + - Passed to all item-level evaluators for assessment + - Traced automatically in Langfuse for observability + + Can return either a direct value or an awaitable (async) result. + + Examples: + Simple synchronous task: + ```python + def my_task(*, item, **kwargs): + prompt = f"Summarize: {item['input']}" + return my_llm_client.generate(prompt) + ``` + + Async task with error handling: + ```python + async def my_async_task(*, item, **kwargs): + try: + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": item["input"]}] + ) + return response.choices[0].message.content + except Exception as e: + # Log error and return fallback + print(f"Task failed for item {item}: {e}") + return "Error: Could not process item" + ``` + + Task using dataset item attributes: + ```python + def classification_task(*, item, **kwargs): + # Works with both dict items and DatasetItem objects + text = item["input"] if isinstance(item, dict) else item.input + return classify_text(text) + ``` + """ + ... + + +class EvaluatorFunction(Protocol): + """Protocol defining the interface for item-level evaluator functions. + + Item-level evaluators assess the quality, correctness, or other properties + of individual task outputs. They receive the input, output, expected output, + and metadata for each item and return evaluation metrics. + + Evaluators should: + - Accept input, output, expected_output, and metadata as keyword arguments + - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields + - Be deterministic when possible for reproducible results + - Handle edge cases gracefully (missing expected output, malformed data, etc.) + - Can be either synchronous or asynchronous + """ + + def __call__( + self, + *, + input: Any, + output: Any, + expected_output: Any, + metadata: Optional[Dict[str, Any]], + **kwargs: Dict[str, Any], + ) -> Union[ + Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] + ]: + """Evaluate a task output for quality, correctness, or other metrics. + + This method should implement specific evaluation logic such as accuracy checking, + similarity measurement, toxicity detection, fluency assessment, etc. + + Args: + input: The original input that was passed to the task function. + This is typically the item['input'] or item.input value. + output: The output produced by the task function for this input. + This is the direct return value from your task function. + expected_output: The expected/ground truth output for comparison. + May be None if not available in the dataset. Evaluators should + handle this case appropriately. + metadata: Optional metadata from the experiment item that might + contain additional context for evaluation (categories, difficulty, etc.) 
+ **kwargs: Additional keyword arguments that may be passed by the framework + + Returns: + Evaluation results in one of these formats: + - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."} + - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}] + - Awaitable returning either of the above (for async evaluators) + + Each Evaluation dict should contain: + - name (str): Unique identifier for this evaluation metric + - value (int|float|str|bool): The evaluation score or result + - comment (str, optional): Human-readable explanation of the result + - metadata (dict, optional): Additional structured data about the evaluation + + Examples: + Simple accuracy evaluator: + ```python + def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): + if expected_output is None: + return {"name": "accuracy", "value": None, "comment": "No expected output"} + + is_correct = output.strip().lower() == expected_output.strip().lower() + return { + "name": "accuracy", + "value": 1.0 if is_correct else 0.0, + "comment": "Exact match" if is_correct else "No match" + } + ``` + + Multi-metric evaluator: + ```python + def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): + results = [] + + # Length check + results.append({ + "name": "output_length", + "value": len(output), + "comment": f"Output contains {len(output)} characters" + }) + + # Sentiment analysis + sentiment_score = analyze_sentiment(output) + results.append({ + "name": "sentiment", + "value": sentiment_score, + "comment": f"Sentiment score: {sentiment_score:.2f}" + }) + + return results + ``` + + Async evaluator using external API: + ```python + async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs): + prompt = f"Rate the quality of this response on a scale of 1-10:\n" + prompt += f"Question: {input}\nResponse: {output}" + + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": prompt}] + ) + + try: + score = float(response.choices[0].message.content.strip()) + return { + "name": "llm_judge_quality", + "value": score, + "comment": f"LLM judge rated this {score}/10" + } + except ValueError: + return { + "name": "llm_judge_quality", + "value": None, + "comment": "Could not parse LLM judge score" + } + ``` + + Context-aware evaluator: + ```python + def context_evaluator(*, input, output, metadata=None, **kwargs): + # Use metadata for context-specific evaluation + difficulty = metadata.get("difficulty", "medium") if metadata else "medium" + + # Adjust expectations based on difficulty + min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty] + + meets_requirement = len(output) >= min_length + return { + "name": f"meets_{difficulty}_requirement", + "value": meets_requirement, + "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement" + } + ``` + """ + ... + + +class RunEvaluatorFunction(Protocol): + """Protocol defining the interface for run-level evaluator functions. + + Run-level evaluators assess aggregate properties of the entire experiment run, + computing metrics that span across all items rather than individual outputs. + They receive the complete results from all processed items and can compute + statistics like averages, distributions, correlations, or other aggregate metrics. 
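Bringing the two evaluator kinds together, the sketch below wires an item-level evaluator and a run-level evaluator into `Langfuse.run_experiment` as added by this patch. The task, data, and metric names are toy placeholders, the task follows the keyword-only `TaskFunction` protocol documented here, and Langfuse credentials are assumed to be configured via the usual environment variables:

```python
from langfuse import Langfuse

langfuse = Langfuse()  # assumes LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY are set


def echo_task(*, item, **kwargs):
    # Toy task: "answer" by echoing the input back.
    return f"Answer to: {item['input']}"


def length_evaluator(*, input, output, expected_output=None, **kwargs):
    # Item-level metric: size of the produced output.
    return {"name": "output_length", "value": len(output)}


def average_length(*, item_results, **kwargs):
    # Run-level metric: mean of the item-level "output_length" scores.
    lengths = [
        evaluation["value"]
        for result in item_results
        for evaluation in result["evaluations"]
        if evaluation["name"] == "output_length"
    ]
    if not lengths:
        return {"name": "avg_output_length", "value": None, "comment": "No lengths recorded"}
    return {
        "name": "avg_output_length",
        "value": sum(lengths) / len(lengths),
        "comment": f"Averaged over {len(lengths)} items",
    }


result = langfuse.run_experiment(
    name="Length sanity check",
    data=[
        {"input": "What is the capital of France?"},
        {"input": "Name a prime number."},
    ],
    task=echo_task,
    evaluators=[length_evaluator],
    run_evaluators=[average_length],
)
print(result["run_evaluations"])
```

At this point in the patch series the experiment result is still a TypedDict, so dictionary access is used above; the later "move to classes" commit switches the result objects to attribute access.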
+ + Run evaluators should: + - Accept item_results as a keyword argument containing all item results + - Return Evaluation dict(s) with aggregate metrics + - Handle cases where some items may have failed processing + - Compute meaningful statistics across the dataset + - Can be either synchronous or asynchronous + """ + + def __call__( + self, + *, + item_results: List[ExperimentItemResult], + **kwargs: Dict[str, Any], + ) -> Union[ + Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] + ]: + """Evaluate the entire experiment run with aggregate metrics. + + This method should implement aggregate evaluation logic such as computing + averages, calculating distributions, finding correlations, detecting patterns + across items, or performing statistical analysis on the experiment results. + + Args: + item_results: List of results from all successfully processed experiment items. + Each item result contains: + - item: The original experiment item + - output: The task function's output for this item + - evaluations: List of item-level evaluation results + - trace_id: Langfuse trace ID for this execution + - dataset_run_id: Dataset run ID (if using Langfuse datasets) + + Note: This list only includes items that were successfully processed. + Failed items are excluded but logged separately. + **kwargs: Additional keyword arguments that may be passed by the framework + + Returns: + Evaluation results in one of these formats: + - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."} + - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}] + - Awaitable returning either of the above (for async evaluators) + + Each Evaluation dict should contain: + - name (str): Unique identifier for this run-level metric + - value (int|float|str|bool): The aggregate evaluation result + - comment (str, optional): Human-readable explanation of the metric + - metadata (dict, optional): Additional structured data about the evaluation + + Examples: + Average accuracy calculator: + ```python + def average_accuracy(*, item_results, **kwargs): + if not item_results: + return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"} + + accuracy_values = [] + for result in item_results: + for evaluation in result["evaluations"]: + if evaluation["name"] == "accuracy": + accuracy_values.append(evaluation["value"]) + + if not accuracy_values: + return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"} + + avg = sum(accuracy_values) / len(accuracy_values) + return { + "name": "avg_accuracy", + "value": avg, + "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}" + } + ``` + + Multiple aggregate metrics: + ```python + def statistical_summary(*, item_results, **kwargs): + if not item_results: + return [] + + results = [] + + # Calculate output length statistics + lengths = [len(str(result["output"])) for result in item_results] + results.extend([ + {"name": "avg_output_length", "value": sum(lengths) / len(lengths)}, + {"name": "min_output_length", "value": min(lengths)}, + {"name": "max_output_length", "value": max(lengths)} + ]) + + # Success rate + total_items = len(item_results) # Only successful items are included + results.append({ + "name": "processing_success_rate", + "value": 1.0, # All items in item_results succeeded + "comment": f"Successfully processed {total_items} items" + }) + + return results + ``` + + Async run evaluator with external analysis: + ```python + async def 
llm_batch_analysis(*, item_results, **kwargs): + # Prepare batch analysis prompt + outputs = [result["output"] for result in item_results] + prompt = f"Analyze these {len(outputs)} outputs for common themes:\n" + prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs)) + + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": prompt}] + ) + + return { + "name": "thematic_analysis", + "value": response.choices[0].message.content, + "comment": f"LLM analysis of {len(outputs)} outputs" + } + ``` + + Performance distribution analysis: + ```python + def performance_distribution(*, item_results, **kwargs): + # Extract all evaluation scores + all_scores = [] + score_by_metric = {} + + for result in item_results: + for evaluation in result["evaluations"]: + metric_name = evaluation["name"] + value = evaluation["value"] + + if isinstance(value, (int, float)): + all_scores.append(value) + if metric_name not in score_by_metric: + score_by_metric[metric_name] = [] + score_by_metric[metric_name].append(value) + + results = [] + + # Overall score distribution + if all_scores: + import statistics + results.append({ + "name": "score_std_dev", + "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0, + "comment": f"Standard deviation across all numeric scores" + }) + + # Per-metric statistics + for metric, scores in score_by_metric.items(): + if len(scores) > 1: + results.append({ + "name": f"{metric}_variance", + "value": statistics.variance(scores), + "comment": f"Variance in {metric} across {len(scores)} items" + }) + + return results + ``` + """ + ... + + +def format_experiment_result( + experiment_result: ExperimentResult, + *, + include_item_results: bool = False, +) -> str: + """Format an experiment result for human-readable display. + + Takes an ExperimentResult object and converts it into a nicely formatted + string suitable for console output or logging. The output includes experiment + overview, aggregate statistics, and optionally individual item details. + + Args: + experiment_result: Complete experiment result containing name, description, + item results, run evaluations, and dataset run information. + include_item_results: Whether to include detailed results for each individual + item in the output. When False (default), only shows aggregate statistics. + Set to True to see input/output/scores for every processed item. + + Returns: + A formatted multi-line string containing: + - Experiment name and description + - Number of items processed + - List of evaluation metrics used + - Average scores across all items + - Run-level evaluation results + - Dataset run URL (if available) + - Individual item details (if include_item_results=True) + + Examples: + Basic usage with aggregate results only: + ```python + result = langfuse.run_experiment(...) + print(format_experiment_result(result)) + ``` + + Detailed output including individual items: + ```python + result = langfuse.run_experiment(...) + detailed_report = format_experiment_result( + result, + include_item_results=True + ) + print(detailed_report) + ``` + + Save formatted results to file: + ```python + result = dataset.run_experiment(...) 
+ with open("experiment_report.txt", "w") as f: + f.write(format_experiment_result(result, include_item_results=True)) + ``` + """ + item_results = experiment_result["item_results"] + run_evaluations = experiment_result["run_evaluations"] + dataset_run_url = experiment_result["dataset_run_url"] + + if not item_results: + return "No experiment results to display." + + output = "" + + # Individual results + if include_item_results: + for i, result in enumerate(item_results): + output += f"\n{i + 1}. Item {i + 1}:\n" + + # Input, expected, and actual + item_input = None + if isinstance(result["item"], dict): + item_input = result["item"].get("input") + elif hasattr(result["item"], "input"): + item_input = result["item"].input + + if item_input is not None: + output += f" Input: {_format_value(item_input)}\n" + + expected_output = None + if isinstance(result["item"], dict): + expected_output = result["item"].get("expected_output") + elif hasattr(result["item"], "expected_output"): + expected_output = result["item"].expected_output + + if expected_output is not None: + output += f" Expected: {_format_value(expected_output)}\n" + output += f" Actual: {_format_value(result['output'])}\n" + + # Scores + if result["evaluations"]: + output += " Scores:\n" + for evaluation in result["evaluations"]: + score = evaluation["value"] + if isinstance(score, (int, float)): + score = f"{score:.3f}" + output += f" โ€ข {evaluation['name']}: {score}" + if evaluation.get("comment"): + output += f"\n ๐Ÿ’ญ {evaluation['comment']}" + output += "\n" + + # Trace link + if result.get("trace_id"): + # Note: We'd need the langfuse client to generate the actual URL + output += f"\n Trace ID: {result['trace_id']}\n" + else: + output += f"Individual Results: Hidden ({len(item_results)} items)\n" + output += "๐Ÿ’ก Set include_item_results=True to view them\n" + + # Experiment Overview + output += f"\n{'โ”€' * 50}\n" + output += f"๐Ÿ“Š {experiment_result['name']}" + if experiment_result["description"]: + output += f" - {experiment_result['description']}" + + output += f"\n{len(item_results)} items" + + # Get unique evaluation names + evaluation_names = set() + for result in item_results: + for evaluation in result["evaluations"]: + evaluation_names.add(evaluation["name"]) + + if evaluation_names: + output += "\nEvaluations:" + for eval_name in evaluation_names: + output += f"\n โ€ข {eval_name}" + output += "\n" + + # Average scores + if evaluation_names: + output += "\nAverage Scores:" + for eval_name in evaluation_names: + scores = [] + for result in item_results: + for evaluation in result["evaluations"]: + if evaluation["name"] == eval_name and isinstance( + evaluation["value"], (int, float) + ): + scores.append(evaluation["value"]) + + if scores: + avg = sum(scores) / len(scores) + output += f"\n โ€ข {eval_name}: {avg:.3f}" + output += "\n" + + # Run evaluations + if run_evaluations: + output += "\nRun Evaluations:" + for run_eval in run_evaluations: + score = run_eval["value"] + if isinstance(score, (int, float)): + score = f"{score:.3f}" + output += f"\n โ€ข {run_eval['name']}: {score}" + if run_eval.get("comment"): + output += f"\n ๐Ÿ’ญ {run_eval['comment']}" + output += "\n" + + if dataset_run_url: + output += f"\n๐Ÿ”— Dataset Run:\n {dataset_run_url}" + + return output + + +def _format_value(value: Any) -> str: + """Format a value for display.""" + if isinstance(value, str): + return value[:50] + "..." 
if len(value) > 50 else value + return str(value) + + +async def _run_evaluator( + evaluator: Union[EvaluatorFunction, RunEvaluatorFunction], **kwargs: Any +) -> List[Evaluation]: + """Run an evaluator function and normalize the result.""" + try: + result = evaluator(**kwargs) + + # Handle async evaluators + if asyncio.iscoroutine(result): + result = await result + + # Normalize to list + if isinstance(result, dict): + return [result] + + elif isinstance(result, list): + return result + + else: + return [] + + except Exception as e: + evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator") + logging.getLogger("langfuse").error(f"Evaluator {evaluator_name} failed: {e}") + return [] + + +async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any: + """Run a task function and handle sync/async.""" + result = task(item=item) + + # Handle async tasks + if asyncio.iscoroutine(result): + result = await result + + return result + + +def create_evaluator_from_autoevals( + autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]] +) -> EvaluatorFunction: + """Create a Langfuse evaluator from an autoevals evaluator. + + Args: + autoevals_evaluator: An autoevals evaluator instance + **kwargs: Additional arguments passed to the evaluator + + Returns: + A Langfuse-compatible evaluator function + """ + + def langfuse_evaluator( + *, + input: Any, + output: Any, + expected_output: Any, + metadata: Optional[Dict[str, Any]], + **kwargs: Dict[str, Any], + ) -> Evaluation: + evaluation = autoevals_evaluator( + input=input, output=output, expected=expected_output, **kwargs + ) + + return Evaluation( + name=evaluation.name, value=evaluation.score, metadata=evaluation.metadata + ) + + return langfuse_evaluator diff --git a/tests/test_core_sdk.py b/tests/test_core_sdk.py index 9a758e38a..f29851d84 100644 --- a/tests/test_core_sdk.py +++ b/tests/test_core_sdk.py @@ -1934,8 +1934,8 @@ def test_start_as_current_observation_types(): def test_that_generation_like_properties_are_actually_created(): """Test that generation-like observation types properly support generation properties.""" from langfuse._client.constants import ( - get_observation_types_list, ObservationTypeGenerationLike, + get_observation_types_list, ) langfuse = Langfuse() From e2d08ae48e66a7c8d35b19a7c25efca08a1d268f Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:11:06 +0200 Subject: [PATCH 14/25] push --- langfuse/__init__.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/langfuse/__init__.py b/langfuse/__init__.py index 3449e851f..b2cfa96f6 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -6,16 +6,16 @@ from ._client.get_client import get_client from ._client.observe import observe from ._client.span import ( - LangfuseEvent, - LangfuseGeneration, - LangfuseSpan, LangfuseAgent, - LangfuseTool, LangfuseChain, LangfuseEmbedding, LangfuseEvaluator, - LangfuseRetriever, + LangfuseEvent, + LangfuseGeneration, LangfuseGuardrail, + LangfuseRetriever, + LangfuseSpan, + LangfuseTool, ) Langfuse = _client_module.Langfuse @@ -36,4 +36,5 @@ "LangfuseEvaluator", "LangfuseRetriever", "LangfuseGuardrail", + "experiment", ] From 07b17b9074f99f79f39b2611c0704c6ee8917ea6 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:40:37 +0200 Subject: [PATCH 15/25] push --- langfuse/__init__.py | 1 + langfuse/experiment.py | 6 ++++++ 2 files changed, 7 insertions(+) diff 
--git a/langfuse/__init__.py b/langfuse/__init__.py index b2cfa96f6..049d922cd 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -37,4 +37,5 @@ "LangfuseRetriever", "LangfuseGuardrail", "experiment", + "api", ] diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 249c71d2d..5427f06d9 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -19,6 +19,8 @@ Union, ) +from langfuse.api import ScoreDataType + if TYPE_CHECKING: from langfuse._client.datasets import DatasetItemClient @@ -113,6 +115,8 @@ class Evaluation(TypedDict, total=False): metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details. + data_type: Optional score data type; one of NUMERIC,CATEGORICAL, or BOOLEAN; default: NUMERIC + config_id: Optional Langfuse score config id Examples: Quantitative accuracy evaluation: @@ -159,6 +163,8 @@ class Evaluation(TypedDict, total=False): value: Union[int, float, str, bool, None] comment: Optional[str] metadata: Optional[Dict[str, Any]] + data_type: Optional[ScoreDataType] + config_id: Optional[str] class ExperimentItemResult(TypedDict): From b01cbd082a9f28d9d175c0d9b99083c3cde7b4cc Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:46:05 +0200 Subject: [PATCH 16/25] push --- langfuse/_client/client.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 514e00084..b5479b115 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -82,6 +82,7 @@ Prompt_Text, ) from langfuse.experiment import ( + Evaluation, EvaluatorFunction, ExperimentData, ExperimentItem, @@ -2674,7 +2675,7 @@ async def process_item(item: ExperimentItem) -> dict: valid_results.append(result) # type: ignore # Run experiment-level evaluators - run_evaluations = [] + run_evaluations: List[Evaluation] = [] for run_evaluator in run_evaluators: try: evaluations = await _run_evaluator( @@ -2713,10 +2714,11 @@ async def process_item(item: ExperimentItem) -> dict: if dataset_run_id: self.create_score( dataset_run_id=dataset_run_id, - name=evaluation["name"], - value=evaluation["value"], # type: ignore + name=evaluation.get("name") or "", + value=evaluation.get("value"), # type: ignore comment=evaluation.get("comment"), metadata=evaluation.get("metadata"), + data_type=evaluation.get("data_type"), # type: ignore ) except Exception as e: From cbfcdd43078c002268563a662560709b277b189b Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 11:05:37 +0200 Subject: [PATCH 17/25] push --- langfuse/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/langfuse/__init__.py b/langfuse/__init__.py index 049d922cd..23f6ac143 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -38,4 +38,5 @@ "LangfuseGuardrail", "experiment", "api", + "async_api", ] From 009c191c34be2f39b08d8bd8835dfbc0d306896e Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 11:05:56 +0200 Subject: [PATCH 18/25] push --- langfuse/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/langfuse/__init__.py b/langfuse/__init__.py index 23f6ac143..049d922cd 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -38,5 +38,4 @@ "LangfuseGuardrail", "experiment", "api", - "async_api", ] From 
e4a459946dcd581f731f35ec2892bf25d678418a Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 18:25:03 +0200 Subject: [PATCH 19/25] move to classes --- langfuse/_client/client.py | 76 ++--- langfuse/_client/datasets.py | 15 +- langfuse/experiment.py | 629 ++++++++++++++++++++++------------- tests/test_experiments.py | 168 +++++----- 4 files changed, 527 insertions(+), 361 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index b5479b115..5dac439af 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2543,11 +2543,11 @@ def length_evaluator(*, input, output, expected_output=None, **kwargs): evaluators=[length_evaluator] ) - print(f"Processed {len(result['item_results'])} items") - for item_result in result["item_results"]: - print(f"Input: {item_result['item']['input']}") - print(f"Output: {item_result['output']}") - print(f"Evaluations: {item_result['evaluations']}") + print(f"Processed {len(result.item_results)} items") + for item_result in result.item_results: + print(f"Input: {item_result.item['input']}") + print(f"Output: {item_result.output}") + print(f"Evaluations: {item_result.evaluations}") ``` Advanced experiment with async task and multiple evaluators: @@ -2576,9 +2576,9 @@ def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): def average_accuracy(*, item_results, **kwargs): accuracies = [ - eval["value"] for result in item_results - for eval in result["evaluations"] - if eval["name"] == "accuracy" + eval.value for result in item_results + for eval in result.evaluations + if eval.name == "accuracy" ] return { "name": "average_accuracy", @@ -2656,7 +2656,7 @@ async def _run_experiment_async( semaphore = asyncio.Semaphore(max_concurrency) # Process all items - async def process_item(item: ExperimentItem) -> dict: + async def process_item(item: ExperimentItem) -> ExperimentItemResult: async with semaphore: return await self._process_experiment_item( item, task, evaluators, name, description, metadata @@ -2671,7 +2671,7 @@ async def process_item(item: ExperimentItem) -> dict: for i, result in enumerate(item_results): if isinstance(result, Exception): langfuse_logger.error(f"Item {i} failed: {result}") - elif isinstance(result, dict): + elif isinstance(result, ExperimentItemResult): valid_results.append(result) # type: ignore # Run experiment-level evaluators @@ -2686,9 +2686,7 @@ async def process_item(item: ExperimentItem) -> dict: langfuse_logger.error(f"Run evaluator failed: {e}") # Generate dataset run URL if applicable - dataset_run_id = ( - valid_results[0].get("dataset_run_id") if valid_results else None - ) + dataset_run_id = valid_results[0].dataset_run_id if valid_results else None dataset_run_url = None if dataset_run_id and data: try: @@ -2714,11 +2712,11 @@ async def process_item(item: ExperimentItem) -> dict: if dataset_run_id: self.create_score( dataset_run_id=dataset_run_id, - name=evaluation.get("name") or "", - value=evaluation.get("value"), # type: ignore - comment=evaluation.get("comment"), - metadata=evaluation.get("metadata"), - data_type=evaluation.get("data_type"), # type: ignore + name=evaluation.name or "", + value=evaluation.value, # type: ignore + comment=evaluation.comment, + metadata=evaluation.metadata, + data_type=evaluation.data_type, # type: ignore ) except Exception as e: @@ -2727,14 +2725,14 @@ async def process_item(item: ExperimentItem) -> dict: # Flush scores and traces self.flush() - return { - "name": name, - 
"description": description, - "item_results": valid_results, - "run_evaluations": run_evaluations, - "dataset_run_id": dataset_run_id, - "dataset_run_url": dataset_run_url, - } + return ExperimentResult( + name=name, + description=description, + item_results=valid_results, + run_evaluations=run_evaluations, + dataset_run_id=dataset_run_id, + dataset_run_url=dataset_run_url, + ) async def _process_experiment_item( self, @@ -2744,7 +2742,7 @@ async def _process_experiment_item( experiment_name: str, experiment_description: Optional[str], experiment_metadata: Dict[str, Any], - ) -> dict: + ) -> ExperimentItemResult: # Execute task with tracing span_name = "experiment-item-run" @@ -2842,22 +2840,24 @@ async def _process_experiment_item( for evaluation in eval_results: self.create_score( trace_id=trace_id, - name=evaluation.get("name", "unknown"), - value=evaluation.get("value", -1), # type: ignore - comment=evaluation.get("comment"), - metadata=evaluation.get("metadata"), + name=evaluation.name or "unknown", + value=evaluation.value + if evaluation.value is not None + else -1, # type: ignore + comment=evaluation.comment, + metadata=evaluation.metadata, ) except Exception as e: langfuse_logger.error(f"Evaluator failed: {e}") - return { - "item": item, - "output": output, - "evaluations": evaluations, - "trace_id": trace_id, - "dataset_run_id": dataset_run_id, - } + return ExperimentItemResult( + item=item, + output=output, + evaluations=evaluations, + trace_id=trace_id, + dataset_run_id=dataset_run_id, + ) except Exception as e: span.update( diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index f62c8b0f1..29754a8ce 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -237,12 +237,21 @@ def run_experiment( Will be combined with individual item metadata. Returns: - ExperimentResult dictionary containing: + ExperimentResult object containing: + - name: The experiment name + - description: Optional experiment description - item_results: Results for each dataset item with outputs and evaluations - run_evaluations: Aggregate evaluation results for the entire run - dataset_run_id: ID of the created dataset run in Langfuse - dataset_run_url: Direct URL to view the experiment results in Langfuse UI + The result object provides a format() method for human-readable output: + ```python + result = dataset.run_experiment(...) + print(result.format()) # Summary view + print(result.format(include_item_results=True)) # Detailed view + ``` + Raises: ValueError: If the dataset has no items or no Langfuse client is available @@ -372,8 +381,8 @@ def content_diversity(*, item_results, **kwargs): # Both experiments are now visible in Langfuse for easy comparison print("Compare results in Langfuse:") - print(f"GPT-4: {result_gpt4['dataset_run_url']}") - print(f"Custom: {result_custom['dataset_run_url']}") + print(f"GPT-4: {result_gpt4.dataset_run_url}") + print(f"Custom: {result_custom.dataset_run_url}") ``` Note: diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 5427f06d9..74926cd31 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -7,6 +7,7 @@ import asyncio import logging +from dataclasses import dataclass from typing import ( TYPE_CHECKING, Any, @@ -93,12 +94,13 @@ class LocalExperimentItem(TypedDict, total=False): """ -class Evaluation(TypedDict, total=False): - """Structure for evaluation results returned by evaluator functions. 
+@dataclass(frozen=True) +class Evaluation: + """Represents an evaluation result for an experiment item. - This TypedDict defines the standardized format that all evaluator functions - must return. It provides a consistent structure for storing evaluation metrics - and their metadata across different types of evaluators. + This class provides a strongly-typed way to create evaluation results in evaluator functions. + Users should import this class and return instances instead of dictionaries for better + type safety and IDE support. Attributes: name: Unique identifier for the evaluation metric. Should be descriptive @@ -115,67 +117,128 @@ class Evaluation(TypedDict, total=False): metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details. - data_type: Optional score data type; one of NUMERIC,CATEGORICAL, or BOOLEAN; default: NUMERIC + data_type: Optional score data type; one of NUMERIC, CATEGORICAL, or BOOLEAN; default: NUMERIC config_id: Optional Langfuse score config id Examples: - Quantitative accuracy evaluation: + Basic accuracy evaluation: ```python - accuracy_result: Evaluation = { - "name": "accuracy", - "value": 0.85, - "comment": "85% of responses were correct", - "metadata": {"total_items": 100, "correct_items": 85} - } + from langfuse import Evaluation + + def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): + if not expected_output: + return Evaluation(name="accuracy", value=None, comment="No expected output") + + is_correct = output.strip().lower() == expected_output.strip().lower() + return Evaluation( + name="accuracy", + value=1.0 if is_correct else 0.0, + comment="Correct answer" if is_correct else "Incorrect answer" + ) ``` - Qualitative assessment: + Multi-metric evaluator: ```python - sentiment_result: Evaluation = { - "name": "sentiment", - "value": "positive", - "comment": "Response expresses optimistic viewpoint", - "metadata": {"confidence": 0.92, "model": "sentiment-analyzer-v2"} - } + def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): + return [ + Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"), + Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"), + Evaluation( + name="quality", + value=0.85, + comment="High quality response", + metadata={"confidence": 0.92, "model": "gpt-4"} + ) + ] ``` - Binary check: + Categorical evaluation: ```python - safety_result: Evaluation = { - "name": "safety_check", - "value": True, - "comment": "Content passes all safety filters" - } + def sentiment_evaluator(*, input, output, **kwargs): + sentiment = analyze_sentiment(output) # Returns "positive", "negative", or "neutral" + return Evaluation( + name="sentiment", + value=sentiment, + comment=f"Response expresses {sentiment} sentiment", + data_type="CATEGORICAL" + ) ``` - Failed evaluation: + Failed evaluation with error handling: ```python - failed_result: Evaluation = { - "name": "external_api_score", - "value": None, - "comment": "External API unavailable", - "metadata": {"error": "timeout", "retry_count": 3} - } + def external_api_evaluator(*, input, output, **kwargs): + try: + score = external_api.evaluate(output) + return Evaluation(name="external_score", value=score) + except Exception as e: + return Evaluation( + name="external_score", + value=None, + comment=f"API unavailable: {e}", + metadata={"error": str(e), 
"retry_count": 3} + ) ``` + + Note: + This class is immutable (frozen=True) to ensure evaluation results cannot be + accidentally modified after creation. All fields except name and value are optional. """ name: str value: Union[int, float, str, bool, None] - comment: Optional[str] - metadata: Optional[Dict[str, Any]] - data_type: Optional[ScoreDataType] - config_id: Optional[str] + comment: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + data_type: Optional[ScoreDataType] = None + config_id: Optional[str] = None -class ExperimentItemResult(TypedDict): +@dataclass(frozen=True) +class ExperimentItemResult: """Result structure for individual experiment items. - Args: - item: The original experiment item that was processed - output: The actual output produced by the task - evaluations: List of evaluation results for this item - trace_id: Langfuse trace ID for this item's execution - dataset_run_id: Dataset run ID if this item was part of a Langfuse dataset + This dataclass represents the complete result of processing a single item + during an experiment run, including the original input, task output, + evaluations, and tracing information. + + Attributes: + item: The original experiment item that was processed. Can be either + a dictionary with 'input', 'expected_output', and 'metadata' keys, + or a DatasetItemClient from Langfuse datasets. + output: The actual output produced by the task function for this item. + Can be any type depending on what your task function returns. + evaluations: List of evaluation results for this item. Each evaluation + contains a name, value, optional comment, and optional metadata. + trace_id: Optional Langfuse trace ID for this item's execution. Used + to link the experiment result with the detailed trace in Langfuse UI. + dataset_run_id: Optional dataset run ID if this item was part of a + Langfuse dataset. None for local experiments. + + Examples: + Accessing item result data: + ```python + result = langfuse.run_experiment(...) + for item_result in result.item_results: + print(f"Input: {item_result.item}") + print(f"Output: {item_result.output}") + print(f"Trace: {item_result.trace_id}") + + # Access evaluations + for evaluation in item_result.evaluations: + print(f"{evaluation.name}: {evaluation.value}") + ``` + + Working with different item types: + ```python + # Local experiment item (dict) + if isinstance(item_result.item, dict): + input_data = item_result.item["input"] + expected = item_result.item.get("expected_output") + + # Langfuse dataset item (object with attributes) + else: + input_data = item_result.item.input + expected = item_result.item.expected_output + ``` """ item: ExperimentItem @@ -185,22 +248,291 @@ class ExperimentItemResult(TypedDict): dataset_run_id: Optional[str] -class ExperimentResult(TypedDict): +class ExperimentResult: """Complete result structure for experiment execution. - Args: - item_results: Results from processing each individual data item - run_evaluations: Results from run-level evaluators - dataset_run_id: ID of the dataset run (if using Langfuse datasets) - dataset_run_url: URL to view the dataset run in Langfuse UI + This class encapsulates the complete results of running an experiment on a dataset, + including individual item results, aggregate run-level evaluations, and metadata + about the experiment execution. 
+ + Attributes: + name: The name of the experiment as specified during execution + description: Optional description of the experiment's purpose or methodology + item_results: List of results from processing each individual dataset item, + containing the original item, task output, evaluations, and trace information + run_evaluations: List of aggregate evaluation results computed across all items, + such as average scores, statistical summaries, or cross-item analyses + dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets) + dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI + + Examples: + Basic usage with local dataset: + ```python + result = langfuse.run_experiment( + name="Capital Cities Test", + data=local_data, + task=generate_capital, + evaluators=[accuracy_check] + ) + + print(f"Processed {len(result.item_results)} items") + print(result.format()) # Human-readable summary + + # Access individual results + for item_result in result.item_results: + print(f"Input: {item_result.item}") + print(f"Output: {item_result.output}") + print(f"Scores: {item_result.evaluations}") + ``` + + Usage with Langfuse datasets: + ```python + dataset = langfuse.get_dataset("qa-eval-set") + result = dataset.run_experiment( + name="GPT-4 QA Evaluation", + task=answer_question, + evaluators=[relevance_check, accuracy_check] + ) + + # View in Langfuse UI + if result.dataset_run_url: + print(f"View detailed results: {result.dataset_run_url}") + ``` + + Formatted output: + ```python + # Get summary view + summary = result.format() + print(summary) + + # Get detailed view with individual items + detailed = result.format(include_item_results=True) + with open("experiment_report.txt", "w") as f: + f.write(detailed) + ``` """ - name: str - description: Optional[str] - item_results: List[ExperimentItemResult] - run_evaluations: List[Evaluation] - dataset_run_id: Optional[str] - dataset_run_url: Optional[str] + def __init__( + self, + name: str, + description: Optional[str], + item_results: List[ExperimentItemResult], + run_evaluations: List[Evaluation], + dataset_run_id: Optional[str] = None, + dataset_run_url: Optional[str] = None, + ): + """Initialize an ExperimentResult with the provided data. + + Args: + name: The name of the experiment + description: Optional description of the experiment + item_results: List of results from processing individual dataset items + run_evaluations: List of aggregate evaluation results for the entire run + dataset_run_id: Optional ID of the dataset run (for Langfuse datasets) + dataset_run_url: Optional URL to view results in Langfuse UI + """ + self.name = name + self.description = description + self.item_results = item_results + self.run_evaluations = run_evaluations + self.dataset_run_id = dataset_run_id + self.dataset_run_url = dataset_run_url + + def format(self, *, include_item_results: bool = False) -> str: + r"""Format the experiment result for human-readable display. + + Converts the experiment result into a nicely formatted string suitable for + console output, logging, or reporting. The output includes experiment overview, + aggregate statistics, and optionally individual item details. 
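Beyond printing, the formatted report pairs naturally with simple automation. The sketch below is a hypothetical CI gate (the `gate_on_accuracy` name, the "accuracy" metric, and the 0.8 threshold are assumptions) that logs the summary produced by `format()` and fails the job when the averaged item-level score regresses:

```python
import sys

from langfuse.experiment import ExperimentResult


def gate_on_accuracy(result: ExperimentResult, threshold: float = 0.8) -> None:
    """Fail a CI job when the average 'accuracy' score drops below the threshold."""
    scores = [
        evaluation.value
        for item_result in result.item_results
        for evaluation in item_result.evaluations
        if evaluation.name == "accuracy" and isinstance(evaluation.value, (int, float))
    ]
    average = sum(scores) / len(scores) if scores else 0.0

    print(result.format())  # human-readable summary for the CI log

    if average < threshold:
        print(f"Accuracy regression: {average:.3f} < {threshold:.3f}", file=sys.stderr)
        sys.exit(1)
```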
+
+        This method provides a comprehensive view of experiment performance including:
+        - Experiment metadata (name, description, item count)
+        - List of evaluation metrics used across items
+        - Average scores computed across all processed items
+        - Run-level evaluation results (aggregate metrics)
+        - Links to view detailed results in Langfuse UI (when available)
+        - Individual item details (when requested)
+
+        Args:
+            include_item_results: Whether to include detailed results for each individual
+                item in the formatted output. When False (default), only shows aggregate
+                statistics and summary information. When True, includes input/output/scores
+                for every processed item, making the output significantly longer but more
+                detailed for debugging and analysis purposes.
+
+        Returns:
+            A formatted multi-line string containing:
+            - Experiment name and description (if provided)
+            - Total number of items successfully processed
+            - List of all evaluation metrics that were applied
+            - Average scores across all items for each numeric metric
+            - Run-level evaluation results with comments
+            - Dataset run URL for viewing in Langfuse UI (if applicable)
+            - Individual item details including inputs, outputs, and scores (if requested)
+
+        Examples:
+            Basic usage showing aggregate results only:
+            ```python
+            result = langfuse.run_experiment(
+                name="Capital Cities",
+                data=dataset,
+                task=generate_capital,
+                evaluators=[accuracy_evaluator]
+            )
+
+            print(result.format())
+            # Output:
+            # ──────────────────────────────────────────────────
+            # 📊 Capital Cities
+            # 100 items
+            # Evaluations:
+            #   • accuracy
+            # Average Scores:
+            #   • accuracy: 0.850
+            ```
+
+            Detailed output including all individual item results:
+            ```python
+            detailed_report = result.format(include_item_results=True)
+            print(detailed_report)
+            # Output includes each item:
+            # 1. Item 1:
+            #    Input: What is the capital of France?
+            #    Expected: Paris
+            #    Actual: The capital of France is Paris.
+            #    Scores:
+            #      • accuracy: 1.000
+            #        💭 Correct answer found
+            # [... continues for all items ...]
+            ```
+
+            Saving formatted results to file for reporting:
+            ```python
+            with open("experiment_report.txt", "w") as f:
+                f.write(result.format(include_item_results=True))
+
+            # Or create summary report
+            summary = result.format()  # Aggregate view only
+            print(f"Experiment Summary:\n{summary}")
+            ```
+
+            Integration with logging systems:
+            ```python
+            import logging
+            logger = logging.getLogger("experiments")
+
+            # Log summary after experiment
+            logger.info(f"Experiment completed:\n{result.format()}")
+
+            # Log detailed results for failed experiments
+            if any(eval.value < threshold for eval in result.run_evaluations):
+                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
+            ```
+        """
+        if not self.item_results:
+            return "No experiment results to display."
+
+        output = ""
+
+        # Individual results section
+        if include_item_results:
+            for i, result in enumerate(self.item_results):
+                output += f"\n{i + 1}. Item {i + 1}:\n"
+
+                # Extract and display input
+                item_input = None
+                if isinstance(result.item, dict):
+                    item_input = result.item.get("input")
+                elif hasattr(result.item, "input"):
+                    item_input = result.item.input
+
+                if item_input is not None:
+                    output += f"   Input: {_format_value(item_input)}\n"
+
+                # Extract and display expected output
+                expected_output = None
+                if isinstance(result.item, dict):
+                    expected_output = result.item.get("expected_output")
+                elif hasattr(result.item, "expected_output"):
+                    expected_output = result.item.expected_output
+
+                if expected_output is not None:
+                    output += f"   Expected: {_format_value(expected_output)}\n"
+                output += f"   Actual: {_format_value(result.output)}\n"
+
+                # Display evaluation scores
+                if result.evaluations:
+                    output += "   Scores:\n"
+                    for evaluation in result.evaluations:
+                        score = evaluation.value
+                        if isinstance(score, (int, float)):
+                            score = f"{score:.3f}"
+                        output += f"     • {evaluation.name}: {score}"
+                        if evaluation.comment:
+                            output += f"\n       💭 {evaluation.comment}"
+                        output += "\n"
+
+                # Display trace link if available
+                if result.trace_id:
+                    output += f"\n   Trace ID: {result.trace_id}\n"
+        else:
+            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
+            output += "💡 Set include_item_results=True to view them\n"
+
+        # Experiment overview section
+        output += f"\n{'─' * 50}\n"
+        output += f"📊 {self.name}"
+        if self.description:
+            output += f" - {self.description}"
+
+        output += f"\n{len(self.item_results)} items"
+
+        # Collect unique evaluation names across all items
+        evaluation_names = set()
+        for result in self.item_results:
+            for evaluation in result.evaluations:
+                evaluation_names.add(evaluation.name)
+
+        if evaluation_names:
+            output += "\nEvaluations:"
+            for eval_name in evaluation_names:
+                output += f"\n  • {eval_name}"
+            output += "\n"
+
+        # Calculate and display average scores
+        if evaluation_names:
+            output += "\nAverage Scores:"
+            for eval_name in evaluation_names:
+                scores = []
+                for result in self.item_results:
+                    for evaluation in result.evaluations:
+                        if evaluation.name == eval_name and isinstance(
+                            evaluation.value, (int, float)
+                        ):
+                            scores.append(evaluation.value)
+
+                if scores:
+                    avg = sum(scores) / len(scores)
+                    output += f"\n  • {eval_name}: {avg:.3f}"
+            output += "\n"
+
+        # Display run-level evaluations
+        if self.run_evaluations:
+            output += "\nRun Evaluations:"
+            for run_eval in self.run_evaluations:
+                score = run_eval.value
+                if isinstance(score, (int, float)):
+                    score = f"{score:.3f}"
+                output += f"\n  • {run_eval.name}: {score}"
+                if run_eval.comment:
+                    output += f"\n    💭 {run_eval.comment}"
+            output += "\n"
+
+        # Add dataset run URL if available
+        if self.dataset_run_url:
+            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
+
+        return output
 
 
 class TaskFunction(Protocol):
@@ -303,7 +635,7 @@ def __call__(
     ) -> Union[
         Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
     ]:
-        """Evaluate a task output for quality, correctness, or other metrics.
+        r"""Evaluate a task output for quality, correctness, or other metrics.
 
         This method should implement specific evaluation logic such as accuracy
         checking, similarity measurement, toxicity detection, fluency assessment, etc.
@@ -440,7 +772,7 @@ def __call__(
    ) -> Union[
         Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
    ]:
-        """Evaluate the entire experiment run with aggregate metrics.
+        r"""Evaluate the entire experiment run with aggregate metrics.
This method should implement aggregate evaluation logic such as computing averages, calculating distributions, finding correlations, detecting patterns @@ -480,9 +812,9 @@ def average_accuracy(*, item_results, **kwargs): accuracy_values = [] for result in item_results: - for evaluation in result["evaluations"]: - if evaluation["name"] == "accuracy": - accuracy_values.append(evaluation["value"]) + for evaluation in result.evaluations: + if evaluation.name == "accuracy": + accuracy_values.append(evaluation.value) if not accuracy_values: return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"} @@ -504,7 +836,7 @@ def statistical_summary(*, item_results, **kwargs): results = [] # Calculate output length statistics - lengths = [len(str(result["output"])) for result in item_results] + lengths = [len(str(result.output)) for result in item_results] results.extend([ {"name": "avg_output_length", "value": sum(lengths) / len(lengths)}, {"name": "min_output_length", "value": min(lengths)}, @@ -526,7 +858,7 @@ def statistical_summary(*, item_results, **kwargs): ```python async def llm_batch_analysis(*, item_results, **kwargs): # Prepare batch analysis prompt - outputs = [result["output"] for result in item_results] + outputs = [result.output for result in item_results] prompt = f"Analyze these {len(outputs)} outputs for common themes:\n" prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs)) @@ -550,9 +882,9 @@ def performance_distribution(*, item_results, **kwargs): score_by_metric = {} for result in item_results: - for evaluation in result["evaluations"]: - metric_name = evaluation["name"] - value = evaluation["value"] + for evaluation in result.evaluations: + metric_name = evaluation.name + value = evaluation.value if isinstance(value, (int, float)): all_scores.append(value) @@ -586,167 +918,6 @@ def performance_distribution(*, item_results, **kwargs): ... -def format_experiment_result( - experiment_result: ExperimentResult, - *, - include_item_results: bool = False, -) -> str: - """Format an experiment result for human-readable display. - - Takes an ExperimentResult object and converts it into a nicely formatted - string suitable for console output or logging. The output includes experiment - overview, aggregate statistics, and optionally individual item details. - - Args: - experiment_result: Complete experiment result containing name, description, - item results, run evaluations, and dataset run information. - include_item_results: Whether to include detailed results for each individual - item in the output. When False (default), only shows aggregate statistics. - Set to True to see input/output/scores for every processed item. - - Returns: - A formatted multi-line string containing: - - Experiment name and description - - Number of items processed - - List of evaluation metrics used - - Average scores across all items - - Run-level evaluation results - - Dataset run URL (if available) - - Individual item details (if include_item_results=True) - - Examples: - Basic usage with aggregate results only: - ```python - result = langfuse.run_experiment(...) - print(format_experiment_result(result)) - ``` - - Detailed output including individual items: - ```python - result = langfuse.run_experiment(...) - detailed_report = format_experiment_result( - result, - include_item_results=True - ) - print(detailed_report) - ``` - - Save formatted results to file: - ```python - result = dataset.run_experiment(...) 
- with open("experiment_report.txt", "w") as f: - f.write(format_experiment_result(result, include_item_results=True)) - ``` - """ - item_results = experiment_result["item_results"] - run_evaluations = experiment_result["run_evaluations"] - dataset_run_url = experiment_result["dataset_run_url"] - - if not item_results: - return "No experiment results to display." - - output = "" - - # Individual results - if include_item_results: - for i, result in enumerate(item_results): - output += f"\n{i + 1}. Item {i + 1}:\n" - - # Input, expected, and actual - item_input = None - if isinstance(result["item"], dict): - item_input = result["item"].get("input") - elif hasattr(result["item"], "input"): - item_input = result["item"].input - - if item_input is not None: - output += f" Input: {_format_value(item_input)}\n" - - expected_output = None - if isinstance(result["item"], dict): - expected_output = result["item"].get("expected_output") - elif hasattr(result["item"], "expected_output"): - expected_output = result["item"].expected_output - - if expected_output is not None: - output += f" Expected: {_format_value(expected_output)}\n" - output += f" Actual: {_format_value(result['output'])}\n" - - # Scores - if result["evaluations"]: - output += " Scores:\n" - for evaluation in result["evaluations"]: - score = evaluation["value"] - if isinstance(score, (int, float)): - score = f"{score:.3f}" - output += f" โ€ข {evaluation['name']}: {score}" - if evaluation.get("comment"): - output += f"\n ๐Ÿ’ญ {evaluation['comment']}" - output += "\n" - - # Trace link - if result.get("trace_id"): - # Note: We'd need the langfuse client to generate the actual URL - output += f"\n Trace ID: {result['trace_id']}\n" - else: - output += f"Individual Results: Hidden ({len(item_results)} items)\n" - output += "๐Ÿ’ก Set include_item_results=True to view them\n" - - # Experiment Overview - output += f"\n{'โ”€' * 50}\n" - output += f"๐Ÿ“Š {experiment_result['name']}" - if experiment_result["description"]: - output += f" - {experiment_result['description']}" - - output += f"\n{len(item_results)} items" - - # Get unique evaluation names - evaluation_names = set() - for result in item_results: - for evaluation in result["evaluations"]: - evaluation_names.add(evaluation["name"]) - - if evaluation_names: - output += "\nEvaluations:" - for eval_name in evaluation_names: - output += f"\n โ€ข {eval_name}" - output += "\n" - - # Average scores - if evaluation_names: - output += "\nAverage Scores:" - for eval_name in evaluation_names: - scores = [] - for result in item_results: - for evaluation in result["evaluations"]: - if evaluation["name"] == eval_name and isinstance( - evaluation["value"], (int, float) - ): - scores.append(evaluation["value"]) - - if scores: - avg = sum(scores) / len(scores) - output += f"\n โ€ข {eval_name}: {avg:.3f}" - output += "\n" - - # Run evaluations - if run_evaluations: - output += "\nRun Evaluations:" - for run_eval in run_evaluations: - score = run_eval["value"] - if isinstance(score, (int, float)): - score = f"{score:.3f}" - output += f"\n โ€ข {run_eval['name']}: {score}" - if run_eval.get("comment"): - output += f"\n ๐Ÿ’ญ {run_eval['comment']}" - output += "\n" - - if dataset_run_url: - output += f"\n๐Ÿ”— Dataset Run:\n {dataset_run_url}" - - return output - - def _format_value(value: Any) -> str: """Format a value for display.""" if isinstance(value, str): @@ -766,7 +937,7 @@ async def _run_evaluator( result = await result # Normalize to list - if isinstance(result, dict): + if isinstance(result, (dict, 
Evaluation)): return [result] elif isinstance(result, list): @@ -811,7 +982,7 @@ def langfuse_evaluator( output: Any, expected_output: Any, metadata: Optional[Dict[str, Any]], - **kwargs: Dict[str, Any], + **langfuse_kwargs: Dict[str, Any], ) -> Evaluation: evaluation = autoevals_evaluator( input=input, output=output, expected=expected_output, **kwargs diff --git a/tests/test_experiments.py b/tests/test_experiments.py index c278243ab..d6ec67369 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -6,7 +6,7 @@ import pytest from langfuse import get_client -from langfuse._client.experiments import ( +from langfuse.experiment import ( Evaluation, ExperimentData, ExperimentItem, @@ -37,29 +37,25 @@ def mock_task(*, item: ExperimentItem, **kwargs: Dict[str, Any]): def simple_evaluator(*, input, output, expected_output=None, **kwargs): """Return output length.""" - return Evaluation(**{"name": "length_check", "value": len(output)}) + return Evaluation(name="length_check", value=len(output)) def factuality_evaluator(*, input, output, expected_output=None, **kwargs): """Mock factuality evaluator.""" # Simple mock: check if expected output is in the output if expected_output and expected_output.lower() in output.lower(): - return Evaluation( - **{"name": "factuality", "value": 1.0, "comment": "Correct answer found"} - ) - return Evaluation( - **{"name": "factuality", "value": 0.0, "comment": "Incorrect answer"} - ) + return Evaluation(name="factuality", value=1.0, comment="Correct answer found") + return Evaluation(name="factuality", value=0.0, comment="Incorrect answer") def run_evaluator_average_length(*, item_results: List[ExperimentItemResult], **kwargs): """Run evaluator that calculates average output length.""" if not item_results: - return Evaluation(**{"name": "average_length", "value": 0}) + return Evaluation(name="average_length", value=0) - avg_length = sum(len(r["output"]) for r in item_results) / len(item_results) + avg_length = sum(len(r.output) for r in item_results) / len(item_results) - return Evaluation(**{"name": "average_length", "value": avg_length}) + return Evaluation(name="average_length", value=avg_length) # Basic Functionality Tests @@ -77,20 +73,20 @@ def test_run_experiment_on_local_dataset(sample_dataset): ) # Validate basic result structure - assert len(result["item_results"]) == 3 - assert len(result["run_evaluations"]) == 1 - assert result["run_evaluations"][0]["name"] == "average_length" - assert result["dataset_run_id"] is None # No dataset_run_id for local datasets + assert len(result.item_results) == 3 + assert len(result.run_evaluations) == 1 + assert result.run_evaluations[0].name == "average_length" + assert result.dataset_run_id is None # No dataset_run_id for local datasets # Validate item results structure - for item_result in result["item_results"]: - assert "output" in item_result - assert "evaluations" in item_result - assert "trace_id" in item_result + for item_result in result.item_results: + assert hasattr(item_result, "output") + assert hasattr(item_result, "evaluations") + assert hasattr(item_result, "trace_id") assert ( - item_result["dataset_run_id"] is None + item_result.dataset_run_id is None ) # No dataset_run_id for local datasets - assert len(item_result["evaluations"]) == 2 # Both evaluators should run + assert len(item_result.evaluations) == 2 # Both evaluators should run # Flush and wait for server processing langfuse_client.flush() @@ -101,8 +97,8 @@ def test_run_experiment_on_local_dataset(sample_dataset): 
expected_inputs = ["Germany", "France", "Spain"] expected_outputs = ["Capital of Germany", "Capital of France", "Capital of Spain"] - for i, item_result in enumerate(result["item_results"]): - trace_id = item_result["trace_id"] + for i, item_result in enumerate(result.item_results): + trace_id = item_result.trace_id assert trace_id is not None, f"Item {i} should have a trace_id" # Fetch trace from API @@ -173,9 +169,9 @@ def test_run_experiment_on_langfuse_dataset(): ) # Should have dataset run ID for Langfuse datasets - assert result["dataset_run_id"] is not None - assert len(result["item_results"]) == 2 - assert all(item["dataset_run_id"] is not None for item in result["item_results"]) + assert result.dataset_run_id is not None + assert len(result.item_results) == 2 + assert all(item.dataset_run_id is not None for item in result.item_results) # Flush and wait for server processing langfuse_client.flush() @@ -188,13 +184,13 @@ def test_run_experiment_on_langfuse_dataset(): # Validate traces are correctly persisted with input/output/metadata expected_data = {"Germany": "Capital of Germany", "France": "Capital of France"} - dataset_run_id = result["dataset_run_id"] + dataset_run_id = result.dataset_run_id # Create a mapping from dataset item ID to dataset item for validation dataset_item_map = {item.id: item for item in dataset.items} - for i, item_result in enumerate(result["item_results"]): - trace_id = item_result["trace_id"] + for i, item_result in enumerate(result.item_results): + trace_id = item_result.trace_id assert trace_id is not None, f"Item {i} should have a trace_id" # Fetch trace from API @@ -283,7 +279,7 @@ def test_run_experiment_on_langfuse_dataset(): run_item_trace_ids = { item.trace_id for item in dataset_run_items.data if item.trace_id } - result_trace_ids = {item["trace_id"] for item in result["item_results"]} + result_trace_ids = {item.trace_id for item in result.item_results} assert run_item_trace_ids == result_trace_ids, ( f"Dataset run items should link to the same traces as experiment results. 
" @@ -300,7 +296,7 @@ def failing_evaluator(**kwargs): raise Exception("Evaluator failed") def working_evaluator(**kwargs): - return Evaluation(**{"name": "working_eval", "value": 1.0}) + return Evaluation(name="working_eval", value=1.0) result = langfuse_client.run_experiment( name="Error test", @@ -310,14 +306,14 @@ def working_evaluator(**kwargs): ) # Should complete with only working evaluator - assert len(result["item_results"]) == 1 + assert len(result.item_results) == 1 # Only the working evaluator should have produced results assert ( len( [ eval - for eval in result["item_results"][0]["evaluations"] - if eval["name"] == "working_eval" + for eval in result.item_results[0].evaluations + if eval.name == "working_eval" ] ) == 1 @@ -345,7 +341,7 @@ def working_task(item): ) # Should complete but with no valid results since all tasks failed - assert len(result["item_results"]) == 0 + assert len(result.item_results) == 0 langfuse_client.flush() time.sleep(1) @@ -366,8 +362,8 @@ def failing_run_evaluator(**kwargs): ) # Should complete but run evaluations should be empty - assert len(result["item_results"]) == 1 - assert len(result["run_evaluations"]) == 0 + assert len(result.item_results) == 1 + assert len(result.run_evaluations) == 0 langfuse_client.flush() time.sleep(1) @@ -385,8 +381,8 @@ def test_empty_dataset_handling(): run_evaluators=[run_evaluator_average_length], ) - assert len(result["item_results"]) == 0 - assert len(result["run_evaluations"]) == 1 # Run evaluators still execute + assert len(result.item_results) == 0 + assert len(result.run_evaluations) == 1 # Run evaluators still execute langfuse_client.flush() time.sleep(1) @@ -409,10 +405,10 @@ def test_dataset_with_missing_fields(): ) # Should handle missing fields gracefully - assert len(result["item_results"]) == 3 - for item_result in result["item_results"]: - assert "trace_id" in item_result - assert "output" in item_result + assert len(result.item_results) == 3 + for item_result in result.item_results: + assert hasattr(item_result, "trace_id") + assert hasattr(item_result, "output") langfuse_client.flush() time.sleep(1) @@ -430,14 +426,14 @@ def test_large_dataset_with_concurrency(): name="Large dataset test", data=large_dataset, task=lambda **kwargs: f"Processed {kwargs['item']}", - evaluators=[lambda **kwargs: {"name": "simple_eval", "value": 1.0}], + evaluators=[lambda **kwargs: Evaluation(name="simple_eval", value=1.0)], max_concurrency=5, ) - assert len(result["item_results"]) == 20 - for item_result in result["item_results"]: - assert len(item_result["evaluations"]) == 1 - assert "trace_id" in item_result + assert len(result.item_results) == 20 + for item_result in result.item_results: + assert len(item_result.evaluations) == 1 + assert hasattr(item_result, "trace_id") langfuse_client.flush() time.sleep(3) @@ -449,9 +445,7 @@ def test_single_evaluation_return(): langfuse_client = get_client() def single_evaluator(**kwargs): - return Evaluation( - **{"name": "single_eval", "value": 1, "comment": "Single evaluation"} - ) + return Evaluation(name="single_eval", value=1, comment="Single evaluation") result = langfuse_client.run_experiment( name="Single evaluation test", @@ -460,9 +454,9 @@ def single_evaluator(**kwargs): evaluators=[single_evaluator], ) - assert len(result["item_results"]) == 1 - assert len(result["item_results"][0]["evaluations"]) == 1 - assert result["item_results"][0]["evaluations"][0]["name"] == "single_eval" + assert len(result.item_results) == 1 + assert len(result.item_results[0].evaluations) 
== 1 + assert result.item_results[0].evaluations[0].name == "single_eval" langfuse_client.flush() time.sleep(1) @@ -478,9 +472,9 @@ def test_no_evaluators(): task=lambda **kwargs: "result", ) - assert len(result["item_results"]) == 1 - assert len(result["item_results"][0]["evaluations"]) == 0 - assert len(result["run_evaluations"]) == 0 + assert len(result.item_results) == 1 + assert len(result.item_results[0].evaluations) == 0 + assert len(result.run_evaluations) == 0 langfuse_client.flush() time.sleep(1) @@ -492,11 +486,7 @@ def test_only_run_evaluators(): def run_only_evaluator(**kwargs): return Evaluation( - **{ - "name": "run_only_eval", - "value": 10, - "comment": "Run-level evaluation", - } + name="run_only_eval", value=10, comment="Run-level evaluation" ) result = langfuse_client.run_experiment( @@ -506,10 +496,10 @@ def run_only_evaluator(**kwargs): run_evaluators=[run_only_evaluator], ) - assert len(result["item_results"]) == 1 - assert len(result["item_results"][0]["evaluations"]) == 0 # No item evaluations - assert len(result["run_evaluations"]) == 1 - assert result["run_evaluations"][0]["name"] == "run_only_eval" + assert len(result.item_results) == 1 + assert len(result.item_results[0].evaluations) == 0 # No item evaluations + assert len(result.run_evaluations) == 1 + assert result.run_evaluations[0].name == "run_only_eval" langfuse_client.flush() time.sleep(1) @@ -520,13 +510,13 @@ def test_different_data_types(): langfuse_client = get_client() def number_evaluator(**kwargs): - return Evaluation(**{"name": "number_eval", "value": 42}) + return Evaluation(name="number_eval", value=42) def string_evaluator(**kwargs): - return Evaluation(**{"name": "string_eval", "value": "excellent"}) + return Evaluation(name="string_eval", value="excellent") def boolean_evaluator(**kwargs): - return Evaluation(**{"name": "boolean_eval", "value": True}) + return Evaluation(name="boolean_eval", value=True) result = langfuse_client.run_experiment( name="Different data types test", @@ -535,10 +525,10 @@ def boolean_evaluator(**kwargs): evaluators=[number_evaluator, string_evaluator, boolean_evaluator], ) - evaluations = result["item_results"][0]["evaluations"] + evaluations = result.item_results[0].evaluations assert len(evaluations) == 3 - eval_by_name = {e["name"]: e["value"] for e in evaluations} + eval_by_name = {e.name: e.value for e in evaluations} assert eval_by_name["number_eval"] == 42 assert eval_by_name["string_eval"] == "excellent" assert eval_by_name["boolean_eval"] is True @@ -566,20 +556,16 @@ def test_scores_are_persisted(): def test_evaluator(**kwargs): return Evaluation( - **{ - "name": "persistence_test", - "value": 0.85, - "comment": "Test evaluation for persistence", - } + name="persistence_test", + value=0.85, + comment="Test evaluation for persistence", ) def test_run_evaluator(**kwargs): return Evaluation( - **{ - "name": "persistence_run_test", - "value": 0.9, - "comment": "Test run evaluation for persistence", - } + name="persistence_run_test", + value=0.9, + comment="Test run evaluation for persistence", ) result = dataset.run_experiment( @@ -590,9 +576,9 @@ def test_run_evaluator(**kwargs): run_evaluators=[test_run_evaluator], ) - assert result["dataset_run_id"] is not None - assert len(result["item_results"]) == 1 - assert len(result["run_evaluations"]) == 1 + assert result.dataset_run_id is not None + assert len(result.item_results) == 1 + assert len(result.run_evaluations) == 1 langfuse_client.flush() time.sleep(3) @@ -650,9 +636,9 @@ def 
test_multiple_experiments_on_same_dataset(): time.sleep(2) # Both experiments should have different run IDs - assert result1["dataset_run_id"] is not None - assert result2["dataset_run_id"] is not None - assert result1["dataset_run_id"] != result2["dataset_run_id"] + assert result1.dataset_run_id is not None + assert result2.dataset_run_id is not None + assert result1.dataset_run_id != result2.dataset_run_id # Verify both runs exist in database api = get_api() @@ -679,10 +665,10 @@ def test_format_experiment_results_basic(): ) # Basic validation that result structure is correct for formatting - assert len(result["item_results"]) == 1 - assert len(result["run_evaluations"]) == 1 - assert "trace_id" in result["item_results"][0] - assert "evaluations" in result["item_results"][0] + assert len(result.item_results) == 1 + assert len(result.run_evaluations) == 1 + assert hasattr(result.item_results[0], "trace_id") + assert hasattr(result.item_results[0], "evaluations") langfuse_client.flush() time.sleep(1) From fbe54976dc2d9a850adf2ff5a81a4e72617d7ef4 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 18:58:16 +0200 Subject: [PATCH 20/25] move to classes --- langfuse/__init__.py | 3 ++ langfuse/_client/client.py | 6 +-- langfuse/experiment.py | 91 ++++++++++++++++++++++++++++---------- 3 files changed, 73 insertions(+), 27 deletions(-) diff --git a/langfuse/__init__.py b/langfuse/__init__.py index 049d922cd..b2b73b54b 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -1,5 +1,7 @@ """.. include:: ../README.md""" +from langfuse.experiment import Evaluation + from ._client import client as _client_module from ._client.attributes import LangfuseOtelSpanAttributes from ._client.constants import ObservationTypeLiteral @@ -36,6 +38,7 @@ "LangfuseEvaluator", "LangfuseRetriever", "LangfuseGuardrail", + "Evaluation", "experiment", "api", ] diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 5dac439af..3c7558465 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2840,10 +2840,8 @@ async def _process_experiment_item( for evaluation in eval_results: self.create_score( trace_id=trace_id, - name=evaluation.name or "unknown", - value=evaluation.value - if evaluation.value is not None - else -1, # type: ignore + name=evaluation.name, + value=evaluation.value or -1, comment=evaluation.comment, metadata=evaluation.metadata, ) diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 74926cd31..62a15ac23 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -7,7 +7,6 @@ import asyncio import logging -from dataclasses import dataclass from typing import ( TYPE_CHECKING, Any, @@ -94,13 +93,11 @@ class LocalExperimentItem(TypedDict, total=False): """ -@dataclass(frozen=True) class Evaluation: - """Represents an evaluation result for an experiment item. + """Represents an evaluation result for an experiment item or an entire experiment run. This class provides a strongly-typed way to create evaluation results in evaluator functions. - Users should import this class and return instances instead of dictionaries for better - type safety and IDE support. + Users must use keyword arguments when instantiating this class. Attributes: name: Unique identifier for the evaluation metric. Should be descriptive @@ -117,7 +114,7 @@ class Evaluation: metadata: Optional structured metadata about the evaluation process. 
Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details. - data_type: Optional score data type; one of NUMERIC, CATEGORICAL, or BOOLEAN; default: NUMERIC + data_type: Optional score data type, required if value is not NUMERIC; one of NUMERIC, CATEGORICAL, or BOOLEAN; default: NUMERIC config_id: Optional Langfuse score config id Examples: @@ -180,25 +177,47 @@ def external_api_evaluator(*, input, output, **kwargs): ``` Note: - This class is immutable (frozen=True) to ensure evaluation results cannot be - accidentally modified after creation. All fields except name and value are optional. + All arguments must be passed as keywords. Positional arguments are not allowed + to ensure code clarity and prevent errors from argument reordering. """ - name: str - value: Union[int, float, str, bool, None] - comment: Optional[str] = None - metadata: Optional[Dict[str, Any]] = None - data_type: Optional[ScoreDataType] = None - config_id: Optional[str] = None + def __init__( + self, + *, + name: str, + value: Union[int, float, str, bool, None], + comment: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + data_type: Optional[ScoreDataType] = None, + config_id: Optional[str] = None, + ): + """Initialize an Evaluation with the provided data. + + Args: + name: Unique identifier for the evaluation metric + value: The evaluation score or result + comment: Optional human-readable explanation of the result + metadata: Optional structured metadata about the evaluation process + data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN) + config_id: Optional Langfuse score config id + + Note: + All arguments must be provided as keywords. Positional arguments will raise a TypeError. + """ + self.name = name + self.value = value + self.comment = comment + self.metadata = metadata + self.data_type = data_type + self.config_id = config_id -@dataclass(frozen=True) class ExperimentItemResult: """Result structure for individual experiment items. - This dataclass represents the complete result of processing a single item + This class represents the complete result of processing a single item during an experiment run, including the original input, task output, - evaluations, and tracing information. + evaluations, and tracing information. Users must use keyword arguments when instantiating this class. Attributes: item: The original experiment item that was processed. Can be either @@ -239,13 +258,38 @@ class ExperimentItemResult: input_data = item_result.item.input expected = item_result.item.expected_output ``` + + Note: + All arguments must be passed as keywords. Positional arguments are not allowed + to ensure code clarity and prevent errors from argument reordering. """ - item: ExperimentItem - output: Any - evaluations: List[Evaluation] - trace_id: Optional[str] - dataset_run_id: Optional[str] + def __init__( + self, + *, + item: ExperimentItem, + output: Any, + evaluations: List[Evaluation], + trace_id: Optional[str], + dataset_run_id: Optional[str], + ): + """Initialize an ExperimentItemResult with the provided data. + + Args: + item: The original experiment item that was processed + output: The actual output produced by the task function for this item + evaluations: List of evaluation results for this item + trace_id: Optional Langfuse trace ID for this item's execution + dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset + + Note: + All arguments must be provided as keywords. 
Positional arguments will raise a TypeError. + """ + self.item = item + self.output = output + self.evaluations = evaluations + self.trace_id = trace_id + self.dataset_run_id = dataset_run_id class ExperimentResult: @@ -314,6 +358,7 @@ class ExperimentResult: def __init__( self, + *, name: str, description: Optional[str], item_results: List[ExperimentItemResult], @@ -938,7 +983,7 @@ async def _run_evaluator( # Normalize to list if isinstance(result, (dict, Evaluation)): - return [result] + return [result] # type: ignore elif isinstance(result, list): return result From 36ca2c20015974deeb2597928aaacd0a2bad1b04 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 19:00:36 +0200 Subject: [PATCH 21/25] add comment metadata --- langfuse/_client/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 3c7558465..ccafd9bd2 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2507,6 +2507,7 @@ def run_experiment( API rate limits and system resources. metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. + If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. Returns: ExperimentResult dictionary containing: From 32cbe0255e8c9698f280bb92708a71147581d93c Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 23:05:13 +0200 Subject: [PATCH 22/25] add run_name --- langfuse/_client/client.py | 31 ++++++++++++++++--- langfuse/_client/datasets.py | 23 +++++++++----- langfuse/_client/span.py | 26 ++++++++-------- langfuse/experiment.py | 58 ++++++++++++++++++++---------------- tests/test_experiments.py | 28 ++++++++--------- 5 files changed, 99 insertions(+), 67 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 9e324b0a3..86085ebbc 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2463,6 +2463,7 @@ def run_experiment( self, *, name: str, + run_name: Optional[str] = None, description: Optional[str] = None, data: ExperimentData, task: TaskFunction, @@ -2487,7 +2488,10 @@ def run_experiment( Args: name: Human-readable name for the experiment. Used for identification - in the Langfuse UI and for dataset run naming if using Langfuse datasets. + in the Langfuse UI. + run_name: Optional exact name for the experiment run. If provided, this will be + used as the exact dataset run name if the `data` contains Langfuse dataset items. + If not provided, this will default to the experiment name appended with an ISO timestamp. description: Optional description explaining the experiment's purpose, methodology, or expected outcomes. data: Array of data items to process. 
Can be either: @@ -2628,6 +2632,9 @@ def average_accuracy(*, item_results, **kwargs): run_async_safely( self._run_experiment_async( name=name, + run_name=self._create_experiment_run_name( + name=name, run_name=run_name + ), description=description, data=data, task=task, @@ -2643,6 +2650,7 @@ async def _run_experiment_async( self, *, name: str, + run_name: str, description: Optional[str], data: ExperimentData, task: TaskFunction, @@ -2651,7 +2659,9 @@ async def _run_experiment_async( max_concurrency: int, metadata: Dict[str, Any], ) -> ExperimentResult: - langfuse_logger.debug(f"Starting experiment '{name}' with {len(data)} items") + langfuse_logger.debug( + f"Starting experiment '{name}' run '{run_name}' with {len(data)} items" + ) # Set up concurrency control semaphore = asyncio.Semaphore(max_concurrency) @@ -2660,7 +2670,7 @@ async def _run_experiment_async( async def process_item(item: ExperimentItem) -> ExperimentItemResult: async with semaphore: return await self._process_experiment_item( - item, task, evaluators, name, description, metadata + item, task, evaluators, name, run_name, description, metadata ) # Run all items concurrently @@ -2728,6 +2738,7 @@ async def process_item(item: ExperimentItem) -> ExperimentItemResult: return ExperimentResult( name=name, + run_name=run_name, description=description, item_results=valid_results, run_evaluations=run_evaluations, @@ -2741,6 +2752,7 @@ async def _process_experiment_item( task: Callable, evaluators: List[Callable], experiment_name: str, + experiment_run_name: str, experiment_description: Optional[str], experiment_metadata: Dict[str, Any], ) -> ExperimentItemResult: @@ -2764,6 +2776,7 @@ async def _process_experiment_item( final_metadata = { "experiment_name": experiment_name, + "experiment_run_name": experiment_run_name, **experiment_metadata, } @@ -2796,7 +2809,7 @@ async def _process_experiment_item( dataset_run_item = self.api.dataset_run_items.create( request=CreateDatasetRunItemRequest( - runName=experiment_name, + runName=experiment_run_name, runDescription=experiment_description, metadata=experiment_metadata, datasetItemId=item.id, # type: ignore @@ -2864,6 +2877,16 @@ async def _process_experiment_item( ) raise e + def _create_experiment_run_name( + self, *, name: Optional[str] = None, run_name: Optional[str] = None + ) -> str: + if run_name: + return run_name + + iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z") + + return f"{name} - {iso_timestamp}" + def auth_check(self) -> bool: """Check if the provided credentials (public and secret key) are valid. diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index 29754a8ce..beb1248ba 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -6,6 +6,7 @@ from langfuse.experiment import ( EvaluatorFunction, + ExperimentResult, RunEvaluatorFunction, TaskFunction, ) @@ -199,13 +200,14 @@ def run_experiment( self, *, name: str, + run_name: Optional[str] = None, description: Optional[str] = None, task: TaskFunction, evaluators: List[EvaluatorFunction] = [], run_evaluators: List[RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, Any]] = None, - ) -> Any: + ) -> ExperimentResult: """Run an experiment on this Langfuse dataset with automatic tracking. This is a convenience method that runs an experiment using all items in this @@ -222,6 +224,9 @@ def run_experiment( Args: name: Human-readable name for the experiment run. 
This will be used as the dataset run name in Langfuse for tracking and identification. + run_name: Optional exact name for the dataset run. If provided, this will be + used as the exact dataset run name in Langfuse. If not provided, this will + default to the experiment name appended with an ISO timestamp. description: Optional description of the experiment's purpose, methodology, or what you're testing. Appears in the Langfuse UI for context. task: Function that processes each dataset item and returns output. @@ -238,12 +243,13 @@ def run_experiment( Returns: ExperimentResult object containing: - - name: The experiment name - - description: Optional experiment description - - item_results: Results for each dataset item with outputs and evaluations - - run_evaluations: Aggregate evaluation results for the entire run - - dataset_run_id: ID of the created dataset run in Langfuse - - dataset_run_url: Direct URL to view the experiment results in Langfuse UI + - name: The experiment name. + - run_name: The experiment run name (equivalent to the dataset run name). + - description: Optional experiment description. + - item_results: Results for each dataset item with outputs and evaluations. + - run_evaluations: Aggregate evaluation results for the entire run. + - dataset_run_id: ID of the created dataset run in Langfuse. + - dataset_run_url: Direct URL to view the experiment results in Langfuse UI. The result object provides a format() method for human-readable output: ```python @@ -253,7 +259,7 @@ def run_experiment( ``` Raises: - ValueError: If the dataset has no items or no Langfuse client is available + ValueError: If the dataset has no items or no Langfuse client is available. Examples: Basic dataset experiment: @@ -400,6 +406,7 @@ def content_diversity(*, item_results, **kwargs): return langfuse_client.run_experiment( name=name, + run_name=run_name, description=description, data=self.items, task=task, diff --git a/langfuse/_client/span.py b/langfuse/_client/span.py index 68c1e8c63..9fa9c7489 100644 --- a/langfuse/_client/span.py +++ b/langfuse/_client/span.py @@ -1468,19 +1468,19 @@ def start_as_current_generation( return self.start_as_current_observation( name=name, as_type="generation", - input=input, - output=output, - metadata=metadata, - version=version, - level=level, - status_message=status_message, - completion_start_time=completion_start_time, - model=model, - model_parameters=model_parameters, - usage_details=usage_details, - cost_details=cost_details, - prompt=prompt, - ) + input=input, + output=output, + metadata=metadata, + version=version, + level=level, + status_message=status_message, + completion_start_time=completion_start_time, + model=model, + model_parameters=model_parameters, + usage_details=usage_details, + cost_details=cost_details, + prompt=prompt, + ) def create_event( self, diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 62a15ac23..f4c913c37 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -114,8 +114,9 @@ class Evaluation: metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details. - data_type: Optional score data type, required if value is not NUMERIC; one of NUMERIC, CATEGORICAL, or BOOLEAN; default: NUMERIC - config_id: Optional Langfuse score config id + data_type: Optional score data type. Required if value is not NUMERIC. + One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC. 
+ config_id: Optional Langfuse score config ID. Examples: Basic accuracy evaluation: @@ -194,12 +195,12 @@ def __init__( """Initialize an Evaluation with the provided data. Args: - name: Unique identifier for the evaluation metric - value: The evaluation score or result - comment: Optional human-readable explanation of the result - metadata: Optional structured metadata about the evaluation process - data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN) - config_id: Optional Langfuse score config id + name: Unique identifier for the evaluation metric. + value: The evaluation score or result. + comment: Optional human-readable explanation of the result. + metadata: Optional structured metadata about the evaluation process. + data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN). + config_id: Optional Langfuse score config ID. Note: All arguments must be provided as keywords. Positional arguments will raise a TypeError. @@ -276,11 +277,11 @@ def __init__( """Initialize an ExperimentItemResult with the provided data. Args: - item: The original experiment item that was processed - output: The actual output produced by the task function for this item - evaluations: List of evaluation results for this item - trace_id: Optional Langfuse trace ID for this item's execution - dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset + item: The original experiment item that was processed. + output: The actual output produced by the task function for this item. + evaluations: List of evaluation results for this item. + trace_id: Optional Langfuse trace ID for this item's execution. + dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset. Note: All arguments must be provided as keywords. Positional arguments will raise a TypeError. @@ -300,14 +301,15 @@ class ExperimentResult: about the experiment execution. Attributes: - name: The name of the experiment as specified during execution - description: Optional description of the experiment's purpose or methodology + name: The name of the experiment as specified during execution. + run_name: The name of the current experiment run. + description: Optional description of the experiment's purpose or methodology. item_results: List of results from processing each individual dataset item, - containing the original item, task output, evaluations, and trace information + containing the original item, task output, evaluations, and trace information. run_evaluations: List of aggregate evaluation results computed across all items, - such as average scores, statistical summaries, or cross-item analyses - dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets) - dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI + such as average scores, statistical summaries, or cross-item analyses. + dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets). + dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI. Examples: Basic usage with local dataset: @@ -360,6 +362,7 @@ def __init__( self, *, name: str, + run_name: str, description: Optional[str], item_results: List[ExperimentItemResult], run_evaluations: List[Evaluation], @@ -369,14 +372,16 @@ def __init__( """Initialize an ExperimentResult with the provided data. 
Args: - name: The name of the experiment - description: Optional description of the experiment - item_results: List of results from processing individual dataset items - run_evaluations: List of aggregate evaluation results for the entire run - dataset_run_id: Optional ID of the dataset run (for Langfuse datasets) - dataset_run_url: Optional URL to view results in Langfuse UI + name: The name of the experiment. + run_name: The current experiment run name. + description: Optional description of the experiment. + item_results: List of results from processing individual dataset items. + run_evaluations: List of aggregate evaluation results for the entire run. + dataset_run_id: Optional ID of the dataset run (for Langfuse datasets). + dataset_run_url: Optional URL to view results in Langfuse UI. """ self.name = name + self.run_name = run_name self.description = description self.item_results = item_results self.run_evaluations = run_evaluations @@ -526,7 +531,8 @@ def format(self, *, include_item_results: bool = False) -> str: # Experiment overview section output += f"\\n{'โ”€' * 50}\\n" - output += f"๐Ÿ“Š {self.name}" + output += f"๐Ÿงช Experiment: {self.name}" + output += f"\n๐Ÿ“‹ Run name: {self.run_name}" if self.description: output += f" - {self.description}" diff --git a/tests/test_experiments.py b/tests/test_experiments.py index d6ec67369..168310970 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -179,8 +179,9 @@ def test_run_experiment_on_langfuse_dataset(): # Verify dataset run exists via API api = get_api() - runs = api.datasets.get_runs(dataset_name) - assert len(runs.data) >= 1 + dataset_run = api.datasets.get_run( + dataset_name=dataset_name, run_name=result.run_name + ) # Validate traces are correctly persisted with input/output/metadata expected_data = {"Germany": "Capital of Germany", "France": "Capital of France"} @@ -256,22 +257,15 @@ def test_run_experiment_on_langfuse_dataset(): dataset_item.input == matching_input ), f"Trace {trace_id} should correspond to dataset item with input '{matching_input}'" - # Verify dataset run contains the correct trace IDs - dataset_run = None - for run in runs.data: - if run.id == dataset_run_id: - dataset_run = run - break - assert dataset_run is not None, f"Dataset run {dataset_run_id} should exist" - assert dataset_run.name == experiment_name, "Dataset run should have correct name" + assert dataset_run.name == result.run_name, "Dataset run should have correct name" assert ( dataset_run.description == "Test on Langfuse dataset" ), "Dataset run should have correct description" # Get dataset run items to verify trace linkage dataset_run_items = api.dataset_run_items.list( - dataset_id=dataset.id, run_name=experiment_name + dataset_id=dataset.id, run_name=result.run_name ) assert len(dataset_run_items.data) == 2, "Dataset run should have 2 items" @@ -570,6 +564,7 @@ def test_run_evaluator(**kwargs): result = dataset.run_experiment( name="Score persistence test", + run_name="Score persistence test", description="Test score persistence", task=mock_task, evaluators=[test_evaluator], @@ -585,12 +580,11 @@ def test_run_evaluator(**kwargs): # Verify scores are persisted via API api = get_api() - runs = api.datasets.get_runs(dataset_name) - assert len(runs.data) >= 1 + dataset_run = api.datasets.get_run( + dataset_name=dataset_name, run_name=result.run_name + ) - # Verify the run exists with correct name - run_names = [run.name for run in runs.data] - assert "Score persistence test" in run_names + assert dataset_run.name == 
"Score persistence test" def test_multiple_experiments_on_same_dataset(): @@ -616,6 +610,7 @@ def test_multiple_experiments_on_same_dataset(): # Run first experiment result1 = dataset.run_experiment( name="Experiment 1", + run_name="Experiment 1", description="First experiment", task=mock_task, evaluators=[factuality_evaluator], @@ -627,6 +622,7 @@ def test_multiple_experiments_on_same_dataset(): # Run second experiment result2 = dataset.run_experiment( name="Experiment 2", + run_name="Experiment 2", description="Second experiment", task=mock_task, evaluators=[simple_evaluator], From 469166b4dd9d8a7b7a7ab0ed93c5524defb589e4 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 23:17:55 +0200 Subject: [PATCH 23/25] push --- tests/test_openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_openai.py b/tests/test_openai.py index 623802e55..056e4597d 100644 --- a/tests/test_openai.py +++ b/tests/test_openai.py @@ -94,7 +94,7 @@ def test_openai_chat_completion_stream(openai): assert len(chat_content) > 0 langfuse.flush() - sleep(1) + sleep(3) generation = get_api().observations.get_many( name=generation_name, type="GENERATION" From 1c9f01208650517d221a348a90826d7e67d17c0f Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Wed, 17 Sep 2025 09:52:13 +0200 Subject: [PATCH 24/25] add docstring --- langfuse/_client/client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 86085ebbc..c5941af89 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2514,7 +2514,8 @@ def run_experiment( If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. Returns: - ExperimentResult dictionary containing: + ExperimentResult containing: + - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset. - item_results: List of results for each processed item with outputs and evaluations - run_evaluations: List of aggregate evaluation results for the entire run - dataset_run_id: ID of the dataset run (if using Langfuse datasets) From 9e7cac693f8d900c8b0c68bda46a1bac9fce6ac7 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:07:28 +0200 Subject: [PATCH 25/25] add observationid to link calls --- langfuse/_client/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index c5941af89..ceb29c5d3 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -101,6 +101,7 @@ ChatPromptClient, CreateDatasetItemRequest, CreateDatasetRequest, + CreateDatasetRunItemRequest, Dataset, DatasetItem, DatasetStatus, @@ -2806,8 +2807,6 @@ async def _process_experiment_item( # Link to dataset run if this is a dataset item if hasattr(item, "id") and hasattr(item, "dataset_id"): try: - from langfuse.model import CreateDatasetRunItemRequest - dataset_run_item = self.api.dataset_run_items.create( request=CreateDatasetRunItemRequest( runName=experiment_run_name, @@ -2815,6 +2814,7 @@ async def _process_experiment_item( metadata=experiment_metadata, datasetItemId=item.id, # type: ignore traceId=trace_id, + observationId=span.id, ) )