From 7a2232a4d2e65996e22f47d3016c7ec4a358b173 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 11:24:26 +0200 Subject: [PATCH 01/25] feat(experiments): add experiment runner --- langfuse/_client/client.py | 288 +++++++++++++++++++++++++++++++++++ langfuse/_client/datasets.py | 71 ++++++++- 2 files changed, 358 insertions(+), 1 deletion(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index df243e51c..7bfe2ac52 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -3,6 +3,7 @@ This module implements Langfuse's core observability functionality on top of the OpenTelemetry (OTel) standard. """ +import asyncio import logging import os import re @@ -13,6 +14,7 @@ from time import time_ns from typing import ( Any, + Callable, Dict, List, Literal, @@ -44,6 +46,11 @@ get_observation_types_list, ) from langfuse._client.datasets import DatasetClient, DatasetItemClient +from langfuse._client.experiments import ( + ExperimentItem, + ExperimentItemResult, + ExperimentResult, +) from langfuse._client.environment_variables import ( LANGFUSE_DEBUG, LANGFUSE_HOST, @@ -2444,6 +2451,287 @@ def get_dataset( handle_fern_exception(e) raise e + def run_experiment( + self, + *, + name: str, + description: Optional[str] = None, + data: Union[ + List[Union[ExperimentItem, dict, DatasetItem]], List[DatasetItemClient] + ], + task: Callable[ + [Union[ExperimentItem, dict, DatasetItem, DatasetItemClient]], Any + ], + evaluators: Optional[List[Callable]] = None, + run_evaluators: Optional[List[Callable]] = None, + max_concurrency: Optional[int] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> ExperimentResult: + """Run an experiment on a dataset with automatic tracing and evaluation. + + This method executes a task function on each item in the provided dataset, + traces the execution with Langfuse, runs evaluators on the outputs, + and returns formatted results. 
+ + Args: + name: Human-readable name for the experiment + description: Optional description of the experiment's purpose + data: Array of data items to process (ExperimentItem or DatasetItem) + task: Function that processes each data item and returns output + evaluators: Optional list of functions to evaluate each item's output + run_evaluators: Optional list of functions to evaluate the entire experiment + max_concurrency: Maximum number of concurrent task executions + metadata: Optional metadata to attach to the experiment + + Returns: + ExperimentResult containing item results, evaluations, and formatting functions + + Example: + ```python + def task(item): + return f"Processed: {item['input']}" + + def evaluator(*, input, output, expected_output=None, **kwargs): + return {"name": "length", "value": len(output)} + + result = langfuse.run_experiment( + name="Test Experiment", + data=[{"input": "test", "expected_output": "expected"}], + task=task, + evaluators=[evaluator] + ) + + print(result["item_results"]) + ``` + """ + return asyncio.run( + self._run_experiment_async( + name=name, + description=description, + data=data, + task=task, + evaluators=evaluators or [], + run_evaluators=run_evaluators or [], + max_concurrency=max_concurrency, + metadata=metadata or {}, + ) + ) + + async def _run_experiment_async( + self, + *, + name: str, + description: Optional[str], + data: Union[ + List[Union[ExperimentItem, dict, DatasetItem]], List[DatasetItemClient] + ], + task: Callable, + evaluators: List[Callable], + run_evaluators: List[Callable], + max_concurrency: Optional[int], + metadata: Dict[str, Any], + ) -> ExperimentResult: + """Internal async implementation of run_experiment.""" + from langfuse._client.experiments import _run_evaluator + + langfuse_logger.debug(f"Starting experiment '{name}' with {len(data)} items") + + # Set up concurrency control + max_workers = ( + max_concurrency if max_concurrency is not None else min(len(data), 10) + ) + semaphore = asyncio.Semaphore(max_workers) + + # Process all items + async def process_item( + item: Union[ExperimentItem, dict, DatasetItem, DatasetItemClient], + ) -> dict: + async with semaphore: + return await self._process_experiment_item( + item, task, evaluators, name, description, metadata + ) + + # Run all items concurrently + tasks = [process_item(item) for item in data] + item_results = await asyncio.gather(*tasks, return_exceptions=True) + + # Filter out any exceptions and log errors + valid_results: List[ExperimentItemResult] = [] + for i, result in enumerate(item_results): + if isinstance(result, Exception): + langfuse_logger.error(f"Item {i} failed: {result}") + elif isinstance(result, dict): + # Type-cast since we know the structure matches ExperimentItemResult + valid_results.append(result) # type: ignore + + # Run experiment-level evaluators + run_evaluations = [] + for run_evaluator in run_evaluators: + try: + evaluations = await _run_evaluator( + run_evaluator, item_results=valid_results + ) + run_evaluations.extend(evaluations) + except Exception as e: + langfuse_logger.error(f"Run evaluator failed: {e}") + + # Generate dataset run URL if applicable + dataset_run_id = ( + valid_results[0].get("dataset_run_id") if valid_results else None + ) + dataset_run_url = None + if dataset_run_id and data: + try: + # Check if the first item has dataset_id (for DatasetItem objects) + first_item = data[0] + dataset_id = None + if hasattr(first_item, "dataset_id"): + dataset_id = getattr(first_item, "dataset_id", None) + + if dataset_id: + 
project_id = self._get_project_id() + if project_id: + dataset_run_url = f"{self._host}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}" + except Exception: + pass # URL generation is optional + + # Store run-level evaluations as scores + for evaluation in run_evaluations: + try: + if dataset_run_id: + self.create_score( + dataset_run_id=dataset_run_id, + name=evaluation["name"], + value=evaluation["value"], + comment=evaluation.get("comment"), + metadata=evaluation.get("metadata"), + ) + except Exception as e: + langfuse_logger.error(f"Failed to store run evaluation: {e}") + + return { + "item_results": valid_results, + "run_evaluations": run_evaluations, + "dataset_run_id": dataset_run_id, + "dataset_run_url": dataset_run_url, + } + + async def _process_experiment_item( + self, + item: Union[ExperimentItem, dict, DatasetItem, DatasetItemClient], + task: Callable, + evaluators: List[Callable], + experiment_name: str, + experiment_description: Optional[str], + experiment_metadata: Dict[str, Any], + ) -> dict: + """Process a single experiment item with tracing and evaluation.""" + from langfuse._client.experiments import _run_evaluator, _run_task + + # Execute task with tracing + span_name = "experiment-item-run" + with self.start_as_current_span(name=span_name) as span: + try: + # Run the task + output = await _run_task(task, item) + + # Update span with input/output + input_data = ( + item.get("input") + if isinstance(item, dict) + else getattr(item, "input", None) + ) + # Prepare metadata + item_metadata: Dict[str, Any] = {} + if isinstance(item, dict): + item_metadata = item.get("metadata", {}) or {} + + final_metadata = { + "experiment_name": experiment_name, + **experiment_metadata, + } + if isinstance(item_metadata, dict): + final_metadata.update(item_metadata) + + span.update( + input=input_data, + output=output, + metadata=final_metadata, + ) + + # Get trace ID for linking + trace_id = span.trace_id + dataset_run_id = None + + # Link to dataset run if this is a dataset item + if hasattr(item, "id") and hasattr(item, "dataset_id"): + try: + from langfuse.model import CreateDatasetRunItemRequest + + dataset_run_item = self.api.dataset_run_items.create( + request=CreateDatasetRunItemRequest( + runName=experiment_name, + runDescription=experiment_description, + metadata=experiment_metadata, + datasetItemId=item.id, # type: ignore + traceId=trace_id, + ) + ) + dataset_run_id = dataset_run_item.dataset_run_id + except Exception as e: + langfuse_logger.error(f"Failed to create dataset run item: {e}") + + # Run evaluators + evaluations = [] + for evaluator in evaluators: + try: + expected_output = None + if isinstance(item, dict): + expected_output = item.get("expected_output") + elif hasattr(item, "expected_output"): + expected_output = item.expected_output + + eval_metadata: Optional[Dict[str, Any]] = None + if isinstance(item, dict): + eval_metadata = item.get("metadata") + elif hasattr(item, "metadata"): + eval_metadata = item.metadata + + eval_results = await _run_evaluator( + evaluator, + input=input_data, + output=output, + expected_output=expected_output, + metadata=eval_metadata, + ) + evaluations.extend(eval_results) + + # Store evaluations as scores + for evaluation in eval_results: + self.create_score( + trace_id=trace_id, + name=evaluation["name"], + value=evaluation["value"], + comment=evaluation.get("comment"), + metadata=evaluation.get("metadata"), + ) + except Exception as e: + langfuse_logger.error(f"Evaluator failed: {e}") + + return { + "item": item, + 
"output": output, + "evaluations": evaluations, + "trace_id": trace_id, + "dataset_run_id": dataset_run_id, + } + + except Exception as e: + span.update( + output=f"Error: {str(e)}", level="ERROR", status_message=str(e) + ) + raise e + def auth_check(self) -> bool: """Check if the provided credentials (public and secret key) are valid. diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index f06570e57..4589d9d25 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -1,7 +1,7 @@ import datetime as dt import logging from .span import LangfuseSpan -from typing import TYPE_CHECKING, Any, Generator, List, Optional +from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional from opentelemetry.util._decorator import _agnosticcontextmanager @@ -181,3 +181,72 @@ def __init__(self, dataset: Dataset, items: List[DatasetItemClient]): self.created_at = dataset.created_at self.updated_at = dataset.updated_at self.items = items + self._langfuse: Optional["Langfuse"] = None + + def _get_langfuse_client(self) -> Optional["Langfuse"]: + """Get the Langfuse client from the first item.""" + if self._langfuse is None and self.items: + self._langfuse = self.items[0].langfuse + return self._langfuse + + def run_experiment( + self, + *, + name: str, + description: Optional[str] = None, + task: Any, + evaluators: Optional[List[Any]] = None, + run_evaluators: Optional[List[Any]] = None, + max_concurrency: Optional[int] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> Any: + """Run an experiment on this dataset. + + This is a convenience method that calls the Langfuse client's run_experiment + method with this dataset's items as the data. + + Args: + name: Human-readable name for the experiment + description: Optional description of the experiment's purpose + task: Function that processes each data item and returns output + evaluators: Optional list of functions to evaluate each item's output + run_evaluators: Optional list of functions to evaluate the entire experiment + max_concurrency: Maximum number of concurrent task executions + metadata: Optional metadata to attach to the experiment + + Returns: + ExperimentResult containing item results, evaluations, and formatting functions + + Example: + ```python + dataset = langfuse.get_dataset("my-dataset") + + def task(item): + return f"Processed: {item.input}" + + def evaluator(*, input, output, expected_output=None, **kwargs): + return {"name": "length", "value": len(output)} + + result = dataset.run_experiment( + name="Dataset Test Experiment", + task=task, + evaluators=[evaluator] + ) + + print(result["item_results"]) + ``` + """ + langfuse_client = self._get_langfuse_client() + if not langfuse_client: + raise ValueError("No Langfuse client available. 
Dataset items are empty.") + + return langfuse_client.run_experiment( + name=name, + description=description, + data=self.items, + task=task, + evaluators=evaluators, + run_evaluators=run_evaluators, + max_concurrency=max_concurrency, + metadata=metadata, + ) From 2cbf43b881d69c27ea6e669179820a4bcbceb8ca Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:15:29 +0200 Subject: [PATCH 02/25] push --- langfuse/_client/experiments.py | 324 ++++++++++++++++++++ tests/test_experiments.py | 520 ++++++++++++++++++++++++++++++++ 2 files changed, 844 insertions(+) create mode 100644 langfuse/_client/experiments.py create mode 100644 tests/test_experiments.py diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py new file mode 100644 index 000000000..db27153e0 --- /dev/null +++ b/langfuse/_client/experiments.py @@ -0,0 +1,324 @@ +"""Langfuse experiment functionality for running and evaluating tasks on datasets. + +This module provides the core experiment functionality for the Langfuse Python SDK, +allowing users to run experiments on datasets with automatic tracing, evaluation, +and result formatting. +""" + +import asyncio +import logging +from typing import ( + TYPE_CHECKING, + Any, + Awaitable, + Dict, + List, + Optional, + Protocol, + TypedDict, + Union, +) + +from langfuse.model import DatasetItem + +if TYPE_CHECKING: + from langfuse._client.datasets import DatasetItemClient + + +class ExperimentItem(TypedDict, total=False): + """Structure for experiment data items. + + Args: + input: The input data to pass to the task function + expected_output: Optional expected output for evaluation purposes + metadata: Optional metadata for the experiment item + """ + + input: Any + expected_output: Any + metadata: Optional[Dict[str, Any]] + + +class Evaluation(TypedDict, total=False): + """Structure for evaluation results. + + Args: + name: Name of the evaluation metric + value: The evaluation score/value (numeric or string) + comment: Optional comment explaining the evaluation + metadata: Optional metadata for the evaluation + """ + + name: str + value: Union[int, float, str, bool] + comment: Optional[str] + metadata: Optional[Dict[str, Any]] + + +class ExperimentItemResult(TypedDict): + """Result structure for individual experiment items. + + Args: + item: The original experiment item that was processed + output: The actual output produced by the task + evaluations: List of evaluation results for this item + trace_id: Langfuse trace ID for this item's execution + dataset_run_id: Dataset run ID if this item was part of a Langfuse dataset + """ + + item: Union[ExperimentItem, DatasetItem] + output: Any + evaluations: List[Evaluation] + trace_id: Optional[str] + dataset_run_id: Optional[str] + + +class ExperimentResult(TypedDict): + """Complete result structure for experiment execution. 
+ + Args: + item_results: Results from processing each individual data item + run_evaluations: Results from run-level evaluators + dataset_run_id: ID of the dataset run (if using Langfuse datasets) + dataset_run_url: URL to view the dataset run in Langfuse UI + """ + + item_results: List[ExperimentItemResult] + run_evaluations: List[Evaluation] + dataset_run_id: Optional[str] + dataset_run_url: Optional[str] + + +class TaskFunction(Protocol): + """Protocol for experiment task functions.""" + + def __call__( + self, item: Union[ExperimentItem, dict, DatasetItem, "DatasetItemClient"] + ) -> Union[Any, Awaitable[Any]]: + """Execute the task on an experiment item. + + Args: + item: The experiment or dataset item to process + + Returns: + The task output (can be sync or async) + """ + ... + + +class EvaluatorFunction(Protocol): + """Protocol for item-level evaluator functions.""" + + def __call__( + self, + *, + input: Any, + output: Any, + expected_output: Any = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> Union[ + Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] + ]: + """Evaluate a task output. + + Args: + input: The original input to the task + output: The output produced by the task + expected_output: The expected output (if available) + metadata: Optional metadata from the experiment item + + Returns: + Single evaluation or list of evaluations (can be sync or async) + """ + ... + + +class RunEvaluatorFunction(Protocol): + """Protocol for run-level evaluator functions.""" + + def __call__( + self, *, item_results: List[ExperimentItemResult] + ) -> Union[ + Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] + ]: + """Evaluate the entire experiment run. + + Args: + item_results: Results from all processed experiment items + + Returns: + Single evaluation or list of evaluations (can be sync or async) + """ + ... + + +def format_experiment_results( + item_results: List[ExperimentItemResult], + run_evaluations: List[Evaluation], + experiment_name: str, + experiment_description: Optional[str] = None, + dataset_run_url: Optional[str] = None, + include_item_results: bool = False, +) -> str: + """Format experiment results for display. + + Args: + item_results: Results from processing each item + run_evaluations: Results from run-level evaluators + experiment_name: Name of the experiment + experiment_description: Optional description of the experiment + dataset_run_url: Optional URL to dataset run in Langfuse UI + include_item_results: Whether to include individual item details + + Returns: + Formatted string representation of the results + """ + if not item_results: + return "No experiment results to display." + + output = "" + + # Individual results + if include_item_results: + for i, result in enumerate(item_results): + output += f"\n{i + 1}. 
Item {i + 1}:\n"
+
+            # Input, expected, and actual
+            item_input = None
+            if isinstance(result["item"], dict):
+                item_input = result["item"].get("input")
+            elif hasattr(result["item"], "input"):
+                item_input = result["item"].input
+
+            if item_input is not None:
+                output += f" Input: {_format_value(item_input)}\n"
+
+            expected_output = None
+            if isinstance(result["item"], dict):
+                expected_output = result["item"].get("expected_output")
+            elif hasattr(result["item"], "expected_output"):
+                expected_output = result["item"].expected_output
+
+            if expected_output is not None:
+                output += f" Expected: {_format_value(expected_output)}\n"
+            output += f" Actual: {_format_value(result['output'])}\n"
+
+            # Scores
+            if result["evaluations"]:
+                output += " Scores:\n"
+                for evaluation in result["evaluations"]:
+                    score = evaluation["value"]
+                    if isinstance(score, (int, float)):
+                        score = f"{score:.3f}"
+                    output += f" • {evaluation['name']}: {score}"
+                    if evaluation.get("comment"):
+                        output += f"\n 💭 {evaluation['comment']}"
+                    output += "\n"
+
+            # Trace link
+            if result.get("trace_id"):
+                # Note: We'd need the langfuse client to generate the actual URL
+                output += f"\n Trace ID: {result['trace_id']}\n"
+    else:
+        output += f"Individual Results: Hidden ({len(item_results)} items)\n"
+        output += "💡 Set include_item_results=True to view them\n"
+
+    # Experiment Overview
+    output += f"\n{'─' * 50}\n"
+    output += f"📊 {experiment_name}"
+    if experiment_description:
+        output += f" - {experiment_description}"
+
+    output += f"\n{len(item_results)} items"
+
+    # Get unique evaluation names
+    evaluation_names = set()
+    for result in item_results:
+        for evaluation in result["evaluations"]:
+            evaluation_names.add(evaluation["name"])
+
+    if evaluation_names:
+        output += "\nEvaluations:"
+        for eval_name in evaluation_names:
+            output += f"\n • {eval_name}"
+        output += "\n"
+
+    # Average scores
+    if evaluation_names:
+        output += "\nAverage Scores:"
+        for eval_name in evaluation_names:
+            scores = []
+            for result in item_results:
+                for evaluation in result["evaluations"]:
+                    if evaluation["name"] == eval_name and isinstance(
+                        evaluation["value"], (int, float)
+                    ):
+                        scores.append(evaluation["value"])
+
+            if scores:
+                avg = sum(scores) / len(scores)
+                output += f"\n • {eval_name}: {avg:.3f}"
+        output += "\n"
+
+    # Run evaluations
+    if run_evaluations:
+        output += "\nRun Evaluations:"
+        for run_eval in run_evaluations:
+            score = run_eval["value"]
+            if isinstance(score, (int, float)):
+                score = f"{score:.3f}"
+            output += f"\n • {run_eval['name']}: {score}"
+            if run_eval.get("comment"):
+                output += f"\n 💭 {run_eval['comment']}"
+        output += "\n"
+
+    if dataset_run_url:
+        output += f"\n🔗 Dataset Run:\n {dataset_run_url}"
+
+    return output
+
+
+def _format_value(value: Any) -> str:
+    """Format a value for display."""
+    if isinstance(value, str):
+        return value[:50] + "..."
if len(value) > 50 else value + return str(value) + + +async def _run_evaluator( + evaluator: EvaluatorFunction, **kwargs: Any +) -> List[Evaluation]: + """Run an evaluator function and normalize the result.""" + try: + result = evaluator(**kwargs) + + # Handle async evaluators + if asyncio.iscoroutine(result): + result = await result + + # Normalize to list + if isinstance(result, dict): + return [result] + elif isinstance(result, list): + return result + else: + return [] + + except Exception as e: + evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator") + logging.getLogger("langfuse").error(f"Evaluator {evaluator_name} failed: {e}") + return [] + + +async def _run_task( + task: TaskFunction, + item: Union[ExperimentItem, dict, DatasetItem, "DatasetItemClient"], +) -> Any: + """Run a task function and handle sync/async.""" + result = task(item) + + # Handle async tasks + if asyncio.iscoroutine(result): + result = await result + + return result diff --git a/tests/test_experiments.py b/tests/test_experiments.py new file mode 100644 index 000000000..4384001f4 --- /dev/null +++ b/tests/test_experiments.py @@ -0,0 +1,520 @@ +"""Comprehensive tests for Langfuse experiment functionality matching JS SDK.""" + +import time + +import pytest + +from langfuse import get_client +from tests.utils import create_uuid, get_api + + +@pytest.fixture +def sample_dataset(): + """Sample dataset for experiments.""" + return [ + {"input": "Germany", "expected_output": "Berlin"}, + {"input": "France", "expected_output": "Paris"}, + {"input": "Spain", "expected_output": "Madrid"}, + ] + + +def mock_task(item): + """Mock task function that simulates processing.""" + input_val = ( + item.get("input") + if isinstance(item, dict) + else getattr(item, "input", "unknown") + ) + return f"Capital of {input_val}" + + +def simple_evaluator(*, input, output, expected_output=None, **kwargs): + """Simple evaluator that returns output length.""" + return {"name": "length_check", "value": len(output)} + + +def factuality_evaluator(*, input, output, expected_output=None, **kwargs): + """Mock factuality evaluator.""" + # Simple mock: check if expected output is in the output + if expected_output and expected_output.lower() in output.lower(): + return {"name": "factuality", "value": 1.0, "comment": "Correct answer found"} + return {"name": "factuality", "value": 0.0, "comment": "Incorrect answer"} + + +def run_evaluator_average_length(*, item_results, **kwargs): + """Run evaluator that calculates average output length.""" + if not item_results: + return {"name": "average_length", "value": 0} + + avg_length = sum(len(r["output"]) for r in item_results) / len(item_results) + return {"name": "average_length", "value": avg_length} + + +# Basic Functionality Tests +def test_run_experiment_on_local_dataset(sample_dataset): + """Test running experiment on local dataset.""" + langfuse_client = get_client() + result = langfuse_client.run_experiment( + name="Euro capitals", + description="Country capital experiment", + data=sample_dataset, + task=mock_task, + evaluators=[simple_evaluator, factuality_evaluator], + run_evaluators=[run_evaluator_average_length], + ) + + # Validate basic result structure + assert len(result["item_results"]) == 3 + assert len(result["run_evaluations"]) == 1 + assert result["run_evaluations"][0]["name"] == "average_length" + assert result["dataset_run_id"] is None # No dataset_run_id for local datasets + + # Validate item results structure + for item_result in result["item_results"]: + assert 
"output" in item_result + assert "evaluations" in item_result + assert "trace_id" in item_result + assert ( + item_result["dataset_run_id"] is None + ) # No dataset_run_id for local datasets + assert len(item_result["evaluations"]) == 2 # Both evaluators should run + + # Flush and wait for server processing + langfuse_client.flush() + time.sleep(2) + + +def test_run_experiment_on_langfuse_dataset(): + """Test running experiment on Langfuse dataset.""" + langfuse_client = get_client() + # Create dataset + dataset_name = "test-dataset-" + create_uuid() + langfuse_client.create_dataset(name=dataset_name) + + # Add items to dataset + test_items = [ + {"input": "Germany", "expected_output": "Berlin"}, + {"input": "France", "expected_output": "Paris"}, + ] + + for item in test_items: + langfuse_client.create_dataset_item( + dataset_name=dataset_name, + input=item["input"], + expected_output=item["expected_output"], + ) + + # Get dataset and run experiment + dataset = langfuse_client.get_dataset(dataset_name) + + result = dataset.run_experiment( + name="Dataset Test", + description="Test on Langfuse dataset", + task=mock_task, + evaluators=[factuality_evaluator], + ) + + # Should have dataset run ID for Langfuse datasets + assert result["dataset_run_id"] is not None + assert len(result["item_results"]) == 2 + assert all(item["dataset_run_id"] is not None for item in result["item_results"]) + + # Flush and wait for server processing + langfuse_client.flush() + time.sleep(3) + + # Verify dataset run exists via API + api = get_api() + runs = api.datasets.get_runs(dataset_name) + assert len(runs.data) >= 1 + + +# Error Handling Tests +def test_evaluator_failures_handled_gracefully(): + """Test that evaluator failures don't break the experiment.""" + langfuse_client = get_client() + + def failing_evaluator(**kwargs): + raise Exception("Evaluator failed") + + def working_evaluator(**kwargs): + return {"name": "working_eval", "value": 1.0} + + result = langfuse_client.run_experiment( + name="Error test", + data=[{"input": "test"}], + task=lambda x: "result", + evaluators=[working_evaluator, failing_evaluator], + ) + + # Should complete with only working evaluator + assert len(result["item_results"]) == 1 + # Only the working evaluator should have produced results + assert ( + len( + [ + eval + for eval in result["item_results"][0]["evaluations"] + if eval["name"] == "working_eval" + ] + ) + == 1 + ) + + langfuse_client.flush() + time.sleep(1) + + +def test_task_failures_handled_gracefully(): + """Test that task failures are handled gracefully and don't stop the experiment.""" + langfuse_client = get_client() + + def failing_task(item): + raise Exception("Task failed") + + def working_task(item): + return f"Processed: {item['input']}" + + # Test with mixed data - some will fail, some will succeed + result = langfuse_client.run_experiment( + name="Task error test", + data=[{"input": "test1"}, {"input": "test2"}], + task=failing_task, + ) + + # Should complete but with no valid results since all tasks failed + assert len(result["item_results"]) == 0 + + langfuse_client.flush() + time.sleep(1) + + +def test_run_evaluator_failures_handled(): + """Test that run evaluator failures don't break the experiment.""" + langfuse_client = get_client() + + def failing_run_evaluator(**kwargs): + raise Exception("Run evaluator failed") + + result = langfuse_client.run_experiment( + name="Run evaluator error test", + data=[{"input": "test"}], + task=lambda x: "result", + run_evaluators=[failing_run_evaluator], + ) + + # 
Should complete but run evaluations should be empty + assert len(result["item_results"]) == 1 + assert len(result["run_evaluations"]) == 0 + + langfuse_client.flush() + time.sleep(1) + + +# Edge Cases Tests +def test_empty_dataset_handling(): + """Test experiment with empty dataset.""" + langfuse_client = get_client() + + result = langfuse_client.run_experiment( + name="Empty dataset test", + data=[], + task=lambda x: "result", + run_evaluators=[run_evaluator_average_length], + ) + + assert len(result["item_results"]) == 0 + assert len(result["run_evaluations"]) == 1 # Run evaluators still execute + + langfuse_client.flush() + time.sleep(1) + + +def test_dataset_with_missing_fields(): + """Test handling dataset with missing fields.""" + langfuse_client = get_client() + + incomplete_dataset = [ + {"input": "Germany"}, # Missing expected_output + {"expected_output": "Paris"}, # Missing input + {"input": "Spain", "expected_output": "Madrid"}, # Complete + ] + + result = langfuse_client.run_experiment( + name="Incomplete data test", + data=incomplete_dataset, + task=lambda x: "result", + ) + + # Should handle missing fields gracefully + assert len(result["item_results"]) == 3 + for item_result in result["item_results"]: + assert "trace_id" in item_result + assert "output" in item_result + + langfuse_client.flush() + time.sleep(1) + + +def test_large_dataset_with_concurrency(): + """Test handling large dataset with concurrency control.""" + langfuse_client = get_client() + + large_dataset = [ + {"input": f"Item {i}", "expected_output": f"Output {i}"} for i in range(20) + ] + + result = langfuse_client.run_experiment( + name="Large dataset test", + data=large_dataset, + task=lambda x: f"Processed {x['input']}", + evaluators=[lambda **kwargs: {"name": "simple_eval", "value": 1.0}], + max_concurrency=5, + ) + + assert len(result["item_results"]) == 20 + for item_result in result["item_results"]: + assert len(item_result["evaluations"]) == 1 + assert "trace_id" in item_result + + langfuse_client.flush() + time.sleep(3) + + +# Evaluator Configuration Tests +def test_single_evaluation_return(): + """Test evaluators returning single evaluation instead of array.""" + langfuse_client = get_client() + + def single_evaluator(**kwargs): + return {"name": "single_eval", "value": 1, "comment": "Single evaluation"} + + result = langfuse_client.run_experiment( + name="Single evaluation test", + data=[{"input": "test"}], + task=lambda x: "result", + evaluators=[single_evaluator], + ) + + assert len(result["item_results"]) == 1 + assert len(result["item_results"][0]["evaluations"]) == 1 + assert result["item_results"][0]["evaluations"][0]["name"] == "single_eval" + + langfuse_client.flush() + time.sleep(1) + + +def test_no_evaluators(): + """Test experiment with no evaluators.""" + langfuse_client = get_client() + + result = langfuse_client.run_experiment( + name="No evaluators test", + data=[{"input": "test"}], + task=lambda x: "result", + evaluators=[], + ) + + assert len(result["item_results"]) == 1 + assert len(result["item_results"][0]["evaluations"]) == 0 + assert len(result["run_evaluations"]) == 0 + + langfuse_client.flush() + time.sleep(1) + + +def test_only_run_evaluators(): + """Test experiment with only run evaluators.""" + langfuse_client = get_client() + + def run_only_evaluator(**kwargs): + return { + "name": "run_only_eval", + "value": 10, + "comment": "Run-level evaluation", + } + + result = langfuse_client.run_experiment( + name="Only run evaluators test", + data=[{"input": "test"}], + 
task=lambda x: "result", + evaluators=[], + run_evaluators=[run_only_evaluator], + ) + + assert len(result["item_results"]) == 1 + assert len(result["item_results"][0]["evaluations"]) == 0 # No item evaluations + assert len(result["run_evaluations"]) == 1 + assert result["run_evaluations"][0]["name"] == "run_only_eval" + + langfuse_client.flush() + time.sleep(1) + + +def test_different_data_types(): + """Test evaluators returning different data types.""" + langfuse_client = get_client() + + def number_evaluator(**kwargs): + return {"name": "number_eval", "value": 42} + + def string_evaluator(**kwargs): + return {"name": "string_eval", "value": "excellent"} + + def boolean_evaluator(**kwargs): + return {"name": "boolean_eval", "value": True} + + result = langfuse_client.run_experiment( + name="Different data types test", + data=[{"input": "test"}], + task=lambda x: "result", + evaluators=[number_evaluator, string_evaluator, boolean_evaluator], + ) + + evaluations = result["item_results"][0]["evaluations"] + assert len(evaluations) == 3 + + eval_by_name = {e["name"]: e["value"] for e in evaluations} + assert eval_by_name["number_eval"] == 42 + assert eval_by_name["string_eval"] == "excellent" + assert eval_by_name["boolean_eval"] is True + + langfuse_client.flush() + time.sleep(1) + + +# Data Persistence Tests +def test_scores_are_persisted(): + """Test that scores are properly persisted to the database.""" + langfuse_client = get_client() + + # Create dataset + dataset_name = "score-persistence-" + create_uuid() + langfuse_client.create_dataset(name=dataset_name) + + langfuse_client.create_dataset_item( + dataset_name=dataset_name, + input="Test input", + expected_output="Test output", + ) + + dataset = langfuse_client.get_dataset(dataset_name) + + def test_evaluator(**kwargs): + return { + "name": "persistence_test", + "value": 0.85, + "comment": "Test evaluation for persistence", + } + + def test_run_evaluator(**kwargs): + return { + "name": "persistence_run_test", + "value": 0.9, + "comment": "Test run evaluation for persistence", + } + + result = dataset.run_experiment( + name="Score persistence test", + description="Test score persistence", + task=mock_task, + evaluators=[test_evaluator], + run_evaluators=[test_run_evaluator], + ) + + assert result["dataset_run_id"] is not None + assert len(result["item_results"]) == 1 + assert len(result["run_evaluations"]) == 1 + + langfuse_client.flush() + time.sleep(3) + + # Verify scores are persisted via API + api = get_api() + runs = api.datasets.get_runs(dataset_name) + assert len(runs.data) >= 1 + + # Verify the run exists with correct name + run_names = [run.name for run in runs.data] + assert "Score persistence test" in run_names + + +def test_multiple_experiments_on_same_dataset(): + """Test running multiple experiments on the same dataset.""" + langfuse_client = get_client() + + # Create dataset + dataset_name = "multi-experiment-" + create_uuid() + langfuse_client.create_dataset(name=dataset_name) + + for item in [ + {"input": "Germany", "expected_output": "Berlin"}, + {"input": "France", "expected_output": "Paris"}, + ]: + langfuse_client.create_dataset_item( + dataset_name=dataset_name, + input=item["input"], + expected_output=item["expected_output"], + ) + + dataset = langfuse_client.get_dataset(dataset_name) + + # Run first experiment + result1 = dataset.run_experiment( + name="Experiment 1", + description="First experiment", + task=mock_task, + evaluators=[factuality_evaluator], + ) + + langfuse_client.flush() + time.sleep(2) + + # 
Run second experiment + result2 = dataset.run_experiment( + name="Experiment 2", + description="Second experiment", + task=mock_task, + evaluators=[simple_evaluator], + ) + + langfuse_client.flush() + time.sleep(2) + + # Both experiments should have different run IDs + assert result1["dataset_run_id"] is not None + assert result2["dataset_run_id"] is not None + assert result1["dataset_run_id"] != result2["dataset_run_id"] + + # Verify both runs exist in database + api = get_api() + runs = api.datasets.get_runs(dataset_name) + assert len(runs.data) >= 2 + + run_names = [run.name for run in runs.data] + assert "Experiment 1" in run_names + assert "Experiment 2" in run_names + + +# Result Formatting Tests +def test_format_experiment_results_basic(): + """Test basic result formatting functionality.""" + langfuse_client = get_client() + + result = langfuse_client.run_experiment( + name="Formatting test", + description="Test result formatting", + data=[{"input": "Hello", "expected_output": "Hi"}], + task=lambda x: f"Processed: {x['input']}", + evaluators=[simple_evaluator], + run_evaluators=[run_evaluator_average_length], + ) + + # Basic validation that result structure is correct for formatting + assert len(result["item_results"]) == 1 + assert len(result["run_evaluations"]) == 1 + assert "trace_id" in result["item_results"][0] + assert "evaluations" in result["item_results"][0] + + langfuse_client.flush() + time.sleep(1) From 9eee51d9449ffc0dda664a957a330b9a18f21ff9 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 15:51:00 +0200 Subject: [PATCH 03/25] push --- langfuse/_client/client.py | 71 +++++++++++++++++++++++--------------- tests/test_experiments.py | 1 + 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 7bfe2ac52..6f3722990 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -46,11 +46,6 @@ get_observation_types_list, ) from langfuse._client.datasets import DatasetClient, DatasetItemClient -from langfuse._client.experiments import ( - ExperimentItem, - ExperimentItemResult, - ExperimentResult, -) from langfuse._client.environment_variables import ( LANGFUSE_DEBUG, LANGFUSE_HOST, @@ -61,6 +56,13 @@ LANGFUSE_TRACING_ENABLED, LANGFUSE_TRACING_ENVIRONMENT, ) +from langfuse._client.experiments import ( + ExperimentItem, + ExperimentItemResult, + ExperimentResult, + _run_evaluator, + _run_task, +) from langfuse._client.resource_manager import LangfuseResourceManager from langfuse._client.span import ( LangfuseAgent, @@ -742,7 +744,7 @@ def start_generation( cost_details: Optional[Dict[str, float]] = None, prompt: Optional[PromptClient] = None, ) -> LangfuseGeneration: - """[DEPRECATED] Create a new generation span for model generations. + """Create a new generation span for model generations. DEPRECATED: This method is deprecated and will be removed in a future version. Use start_observation(as_type='generation') instead. @@ -838,7 +840,7 @@ def start_as_current_generation( prompt: Optional[PromptClient] = None, end_on_exit: Optional[bool] = None, ) -> _AgnosticContextManager[LangfuseGeneration]: - """[DEPRECATED] Create a new generation span and set it as the current span in a context manager. + """Create a new generation span and set it as the current span in a context manager. DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='generation') instead. 
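For readers following the deprecation notes in the hunks above, a minimal migration sketch for the recommended replacement is shown below. Only `as_type="generation"` and the `name` parameter are taken from the docstrings in this diff; any further keyword arguments, and the `update()` call, are assumed to mirror the span API used elsewhere in this patch.

```python
from langfuse import get_client

langfuse = get_client()

# Deprecated pattern being phased out:
#   with langfuse.start_as_current_generation(name="summarize") as generation:
#       ...

# Recommended replacement per the docstring above (illustrative sketch, not part of this patch):
with langfuse.start_as_current_observation(
    as_type="generation",
    name="summarize",
) as generation:
    # update() is assumed to behave as on spans elsewhere in this diff
    generation.update(output="...model response...")
```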
@@ -2531,9 +2533,6 @@ async def _run_experiment_async( max_concurrency: Optional[int], metadata: Dict[str, Any], ) -> ExperimentResult: - """Internal async implementation of run_experiment.""" - from langfuse._client.experiments import _run_evaluator - langfuse_logger.debug(f"Starting experiment '{name}' with {len(data)} items") # Set up concurrency control @@ -2561,7 +2560,6 @@ async def process_item( if isinstance(result, Exception): langfuse_logger.error(f"Item {i} failed: {result}") elif isinstance(result, dict): - # Type-cast since we know the structure matches ExperimentItemResult valid_results.append(result) # type: ignore # Run experiment-level evaluators @@ -2585,13 +2583,16 @@ async def process_item( # Check if the first item has dataset_id (for DatasetItem objects) first_item = data[0] dataset_id = None + if hasattr(first_item, "dataset_id"): dataset_id = getattr(first_item, "dataset_id", None) if dataset_id: project_id = self._get_project_id() + if project_id: dataset_run_url = f"{self._host}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}" + except Exception: pass # URL generation is optional @@ -2606,6 +2607,7 @@ async def process_item( comment=evaluation.get("comment"), metadata=evaluation.get("metadata"), ) + except Exception as e: langfuse_logger.error(f"Failed to store run evaluation: {e}") @@ -2625,31 +2627,38 @@ async def _process_experiment_item( experiment_description: Optional[str], experiment_metadata: Dict[str, Any], ) -> dict: - """Process a single experiment item with tracing and evaluation.""" - from langfuse._client.experiments import _run_evaluator, _run_task - # Execute task with tracing span_name = "experiment-item-run" + with self.start_as_current_span(name=span_name) as span: try: - # Run the task output = await _run_task(task, item) - # Update span with input/output input_data = ( item.get("input") if isinstance(item, dict) else getattr(item, "input", None) ) - # Prepare metadata + item_metadata: Dict[str, Any] = {} + if isinstance(item, dict): - item_metadata = item.get("metadata", {}) or {} + item_metadata = item.get("metadata", None) or {} final_metadata = { "experiment_name": experiment_name, **experiment_metadata, } + + if ( + not isinstance(item, dict) + and hasattr(item, "dataset_id") + and hasattr(item, "id") + ): + final_metadata.update( + {"dataset_id": item.dataset_id, "dataset_item_id": item.id} + ) + if isinstance(item_metadata, dict): final_metadata.update(item_metadata) @@ -2668,30 +2677,37 @@ async def _process_experiment_item( try: from langfuse.model import CreateDatasetRunItemRequest - dataset_run_item = self.api.dataset_run_items.create( - request=CreateDatasetRunItemRequest( - runName=experiment_name, - runDescription=experiment_description, - metadata=experiment_metadata, - datasetItemId=item.id, # type: ignore - traceId=trace_id, + dataset_run_item = ( + await self.async_api.dataset_run_items.create( + request=CreateDatasetRunItemRequest( + runName=experiment_name, + runDescription=experiment_description, + metadata=experiment_metadata, + datasetItemId=item.id, # type: ignore + traceId=trace_id, + ) ) ) + dataset_run_id = dataset_run_item.dataset_run_id + except Exception as e: langfuse_logger.error(f"Failed to create dataset run item: {e}") # Run evaluators evaluations = [] + for evaluator in evaluators: try: expected_output = None + if isinstance(item, dict): expected_output = item.get("expected_output") elif hasattr(item, "expected_output"): expected_output = item.expected_output eval_metadata: Optional[Dict[str, 
Any]] = None + if isinstance(item, dict): eval_metadata = item.get("metadata") elif hasattr(item, "metadata"): @@ -2710,11 +2726,12 @@ async def _process_experiment_item( for evaluation in eval_results: self.create_score( trace_id=trace_id, - name=evaluation["name"], - value=evaluation["value"], + name=evaluation.get("name", "unknown"), + value=evaluation.get("value", -1), comment=evaluation.get("comment"), metadata=evaluation.get("metadata"), ) + except Exception as e: langfuse_logger.error(f"Evaluator failed: {e}") diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 4384001f4..86cf0845c 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -112,6 +112,7 @@ def test_run_experiment_on_langfuse_dataset(): description="Test on Langfuse dataset", task=mock_task, evaluators=[factuality_evaluator], + run_evaluators=[run_evaluator_average_length], ) # Should have dataset run ID for Langfuse datasets From 00565f698039bb93fdd9547950231f978134ecca Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:07:42 +0200 Subject: [PATCH 04/25] push --- langfuse/_client/client.py | 41 ++++++------- langfuse/_client/datasets.py | 16 +++-- langfuse/_client/experiments.py | 32 ++++++---- tests/test_experiments.py | 101 +++++++++++++++++++------------- 4 files changed, 108 insertions(+), 82 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 6f3722990..b39ce8a0d 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -57,9 +57,13 @@ LANGFUSE_TRACING_ENVIRONMENT, ) from langfuse._client.experiments import ( + EvaluatorFunction, + ExperimentData, ExperimentItem, ExperimentItemResult, ExperimentResult, + RunEvaluatorFunction, + TaskFunction, _run_evaluator, _run_task, ) @@ -2458,15 +2462,11 @@ def run_experiment( *, name: str, description: Optional[str] = None, - data: Union[ - List[Union[ExperimentItem, dict, DatasetItem]], List[DatasetItemClient] - ], - task: Callable[ - [Union[ExperimentItem, dict, DatasetItem, DatasetItemClient]], Any - ], - evaluators: Optional[List[Callable]] = None, - run_evaluators: Optional[List[Callable]] = None, - max_concurrency: Optional[int] = None, + data: ExperimentData, + task: TaskFunction, + evaluators: List[EvaluatorFunction] = [], + run_evaluators: List[RunEvaluatorFunction] = [], + max_concurrency: int = 50, metadata: Optional[Dict[str, Any]] = None, ) -> ExperimentResult: """Run an experiment on a dataset with automatic tracing and evaluation. 
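The hunk above narrows `run_experiment` to the new `TaskFunction`/`EvaluatorFunction` protocols, which are keyword-only. A call that satisfies them might look like the following sketch; the task and evaluator bodies are illustrative, only the keyword shapes and the `item_results` structure come from this patch.

```python
from langfuse import get_client

langfuse = get_client()


def capital_task(*, item, **kwargs):
    # TaskFunction protocol: keyword-only `item`, extra kwargs tolerated.
    return f"Capital of {item['input']}"


def exact_match(*, input, output, expected_output=None, **kwargs):
    # EvaluatorFunction protocol: returns an Evaluation-shaped dict.
    return {"name": "exact_match", "value": float(output == expected_output)}


result = langfuse.run_experiment(
    name="Capitals",
    data=[{"input": "Germany", "expected_output": "Berlin"}],
    task=capital_task,
    evaluators=[exact_match],
    max_concurrency=10,
)
print(result["item_results"][0]["evaluations"])
```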
@@ -2524,27 +2524,20 @@ async def _run_experiment_async( *, name: str, description: Optional[str], - data: Union[ - List[Union[ExperimentItem, dict, DatasetItem]], List[DatasetItemClient] - ], - task: Callable, - evaluators: List[Callable], - run_evaluators: List[Callable], - max_concurrency: Optional[int], + data: ExperimentData, + task: TaskFunction, + evaluators: List[EvaluatorFunction], + run_evaluators: List[RunEvaluatorFunction], + max_concurrency: int, metadata: Dict[str, Any], ) -> ExperimentResult: langfuse_logger.debug(f"Starting experiment '{name}' with {len(data)} items") # Set up concurrency control - max_workers = ( - max_concurrency if max_concurrency is not None else min(len(data), 10) - ) - semaphore = asyncio.Semaphore(max_workers) + semaphore = asyncio.Semaphore(max_concurrency) # Process all items - async def process_item( - item: Union[ExperimentItem, dict, DatasetItem, DatasetItemClient], - ) -> dict: + async def process_item(item: ExperimentItem) -> dict: async with semaphore: return await self._process_experiment_item( item, task, evaluators, name, description, metadata @@ -2620,7 +2613,7 @@ async def process_item( async def _process_experiment_item( self, - item: Union[ExperimentItem, dict, DatasetItem, DatasetItemClient], + item: ExperimentItem, task: Callable, evaluators: List[Callable], experiment_name: str, diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index 4589d9d25..af79520b1 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -1,10 +1,14 @@ import datetime as dt import logging -from .span import LangfuseSpan from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional from opentelemetry.util._decorator import _agnosticcontextmanager +from langfuse._client.experiments import ( + EvaluatorFunction, + RunEvaluatorFunction, + TaskFunction, +) from langfuse.model import ( CreateDatasetRunItemRequest, Dataset, @@ -12,6 +16,8 @@ DatasetStatus, ) +from .span import LangfuseSpan + if TYPE_CHECKING: from langfuse._client.client import Langfuse @@ -194,10 +200,10 @@ def run_experiment( *, name: str, description: Optional[str] = None, - task: Any, - evaluators: Optional[List[Any]] = None, - run_evaluators: Optional[List[Any]] = None, - max_concurrency: Optional[int] = None, + task: TaskFunction, + evaluators: List[EvaluatorFunction] = [], + run_evaluators: List[RunEvaluatorFunction] = [], + max_concurrency: int = 50, metadata: Optional[Dict[str, Any]] = None, ) -> Any: """Run an experiment on this dataset. diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index db27153e0..65aad2649 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -19,13 +19,11 @@ Union, ) -from langfuse.model import DatasetItem - if TYPE_CHECKING: from langfuse._client.datasets import DatasetItemClient -class ExperimentItem(TypedDict, total=False): +class LocalExperimentItem(TypedDict, total=False): """Structure for experiment data items. Args: @@ -39,6 +37,10 @@ class ExperimentItem(TypedDict, total=False): metadata: Optional[Dict[str, Any]] +ExperimentItem = Union[LocalExperimentItem, DatasetItemClient] +ExperimentData = Union[List[LocalExperimentItem], List[DatasetItemClient]] + + class Evaluation(TypedDict, total=False): """Structure for evaluation results. 
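The `LocalExperimentItem`, `ExperimentItem`, and `ExperimentData` aliases introduced in the hunk above can be used to type local datasets. A small sketch, using the same import path as the tests added later in this patch series:

```python
from langfuse._client.experiments import ExperimentData, LocalExperimentItem

# Fields mirror the LocalExperimentItem TypedDict above; all keys are optional.
item: LocalExperimentItem = {
    "input": "France",
    "expected_output": "Paris",
    "metadata": {"difficulty": "easy"},
}

data: ExperimentData = [item]
```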
@@ -66,7 +68,7 @@ class ExperimentItemResult(TypedDict): dataset_run_id: Dataset run ID if this item was part of a Langfuse dataset """ - item: Union[ExperimentItem, DatasetItem] + item: ExperimentItem output: Any evaluations: List[Evaluation] trace_id: Optional[str] @@ -93,7 +95,10 @@ class TaskFunction(Protocol): """Protocol for experiment task functions.""" def __call__( - self, item: Union[ExperimentItem, dict, DatasetItem, "DatasetItemClient"] + self, + *, + item: ExperimentItem, + **kwargs: Dict[str, Any], ) -> Union[Any, Awaitable[Any]]: """Execute the task on an experiment item. @@ -116,6 +121,7 @@ def __call__( output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None, + **kwargs: Dict[str, Any], ) -> Union[ Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] ]: @@ -137,7 +143,10 @@ class RunEvaluatorFunction(Protocol): """Protocol for run-level evaluator functions.""" def __call__( - self, *, item_results: List[ExperimentItemResult] + self, + *, + item_results: List[ExperimentItemResult], + **kwargs: Dict[str, Any], ) -> Union[ Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] ]: @@ -286,7 +295,7 @@ def _format_value(value: Any) -> str: async def _run_evaluator( - evaluator: EvaluatorFunction, **kwargs: Any + evaluator: Union[EvaluatorFunction, RunEvaluatorFunction], **kwargs: Any ) -> List[Evaluation]: """Run an evaluator function and normalize the result.""" try: @@ -299,8 +308,10 @@ async def _run_evaluator( # Normalize to list if isinstance(result, dict): return [result] + elif isinstance(result, list): return result + else: return [] @@ -310,12 +321,9 @@ async def _run_evaluator( return [] -async def _run_task( - task: TaskFunction, - item: Union[ExperimentItem, dict, DatasetItem, "DatasetItemClient"], -) -> Any: +async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any: """Run a task function and handle sync/async.""" - result = task(item) + result = task(item=item) # Handle async tasks if asyncio.iscoroutine(result): diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 86cf0845c..2a20421ba 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -1,10 +1,17 @@ """Comprehensive tests for Langfuse experiment functionality matching JS SDK.""" import time +from typing import Any, Dict, List import pytest from langfuse import get_client +from langfuse._client.experiments import ( + Evaluation, + ExperimentData, + ExperimentItem, + ExperimentItemResult, +) from tests.utils import create_uuid, get_api @@ -18,7 +25,7 @@ def sample_dataset(): ] -def mock_task(item): +def mock_task(*, item: ExperimentItem, **kwargs: Dict[str, Any]): """Mock task function that simulates processing.""" input_val = ( item.get("input") @@ -29,31 +36,37 @@ def mock_task(item): def simple_evaluator(*, input, output, expected_output=None, **kwargs): - """Simple evaluator that returns output length.""" - return {"name": "length_check", "value": len(output)} + """Return output length.""" + return Evaluation(**{"name": "length_check", "value": len(output)}) def factuality_evaluator(*, input, output, expected_output=None, **kwargs): """Mock factuality evaluator.""" # Simple mock: check if expected output is in the output if expected_output and expected_output.lower() in output.lower(): - return {"name": "factuality", "value": 1.0, "comment": "Correct answer found"} - return {"name": "factuality", "value": 0.0, "comment": "Incorrect answer"} + return Evaluation( + **{"name": 
"factuality", "value": 1.0, "comment": "Correct answer found"} + ) + return Evaluation( + **{"name": "factuality", "value": 0.0, "comment": "Incorrect answer"} + ) -def run_evaluator_average_length(*, item_results, **kwargs): +def run_evaluator_average_length(*, item_results: List[ExperimentItemResult], **kwargs): """Run evaluator that calculates average output length.""" if not item_results: - return {"name": "average_length", "value": 0} + return Evaluation(**{"name": "average_length", "value": 0}) avg_length = sum(len(r["output"]) for r in item_results) / len(item_results) - return {"name": "average_length", "value": avg_length} + + return Evaluation(**{"name": "average_length", "value": avg_length}) # Basic Functionality Tests def test_run_experiment_on_local_dataset(sample_dataset): """Test running experiment on local dataset.""" langfuse_client = get_client() + result = langfuse_client.run_experiment( name="Euro capitals", description="Country capital experiment", @@ -139,12 +152,12 @@ def failing_evaluator(**kwargs): raise Exception("Evaluator failed") def working_evaluator(**kwargs): - return {"name": "working_eval", "value": 1.0} + return Evaluation(**{"name": "working_eval", "value": 1.0}) result = langfuse_client.run_experiment( name="Error test", data=[{"input": "test"}], - task=lambda x: "result", + task=lambda **kwargs: "result", evaluators=[working_evaluator, failing_evaluator], ) @@ -200,7 +213,7 @@ def failing_run_evaluator(**kwargs): result = langfuse_client.run_experiment( name="Run evaluator error test", data=[{"input": "test"}], - task=lambda x: "result", + task=lambda **kwargs: "result", run_evaluators=[failing_run_evaluator], ) @@ -220,7 +233,7 @@ def test_empty_dataset_handling(): result = langfuse_client.run_experiment( name="Empty dataset test", data=[], - task=lambda x: "result", + task=lambda **kwargs: "result", run_evaluators=[run_evaluator_average_length], ) @@ -244,7 +257,7 @@ def test_dataset_with_missing_fields(): result = langfuse_client.run_experiment( name="Incomplete data test", data=incomplete_dataset, - task=lambda x: "result", + task=lambda **kwargs: "result", ) # Should handle missing fields gracefully @@ -261,14 +274,14 @@ def test_large_dataset_with_concurrency(): """Test handling large dataset with concurrency control.""" langfuse_client = get_client() - large_dataset = [ + large_dataset: ExperimentData = [ {"input": f"Item {i}", "expected_output": f"Output {i}"} for i in range(20) ] result = langfuse_client.run_experiment( name="Large dataset test", data=large_dataset, - task=lambda x: f"Processed {x['input']}", + task=lambda **kwargs: f"Processed {kwargs['input']}", evaluators=[lambda **kwargs: {"name": "simple_eval", "value": 1.0}], max_concurrency=5, ) @@ -288,12 +301,14 @@ def test_single_evaluation_return(): langfuse_client = get_client() def single_evaluator(**kwargs): - return {"name": "single_eval", "value": 1, "comment": "Single evaluation"} + return Evaluation( + **{"name": "single_eval", "value": 1, "comment": "Single evaluation"} + ) result = langfuse_client.run_experiment( name="Single evaluation test", data=[{"input": "test"}], - task=lambda x: "result", + task=lambda **kwargs: "result", evaluators=[single_evaluator], ) @@ -312,8 +327,7 @@ def test_no_evaluators(): result = langfuse_client.run_experiment( name="No evaluators test", data=[{"input": "test"}], - task=lambda x: "result", - evaluators=[], + task=lambda **kwargs: "result", ) assert len(result["item_results"]) == 1 @@ -329,17 +343,18 @@ def test_only_run_evaluators(): 
langfuse_client = get_client() def run_only_evaluator(**kwargs): - return { - "name": "run_only_eval", - "value": 10, - "comment": "Run-level evaluation", - } + return Evaluation( + **{ + "name": "run_only_eval", + "value": 10, + "comment": "Run-level evaluation", + } + ) result = langfuse_client.run_experiment( name="Only run evaluators test", data=[{"input": "test"}], - task=lambda x: "result", - evaluators=[], + task=lambda **kwargs: "result", run_evaluators=[run_only_evaluator], ) @@ -357,18 +372,18 @@ def test_different_data_types(): langfuse_client = get_client() def number_evaluator(**kwargs): - return {"name": "number_eval", "value": 42} + return Evaluation(**{"name": "number_eval", "value": 42}) def string_evaluator(**kwargs): - return {"name": "string_eval", "value": "excellent"} + return Evaluation(**{"name": "string_eval", "value": "excellent"}) def boolean_evaluator(**kwargs): - return {"name": "boolean_eval", "value": True} + return Evaluation(**{"name": "boolean_eval", "value": True}) result = langfuse_client.run_experiment( name="Different data types test", data=[{"input": "test"}], - task=lambda x: "result", + task=lambda **kwargs: "result", evaluators=[number_evaluator, string_evaluator, boolean_evaluator], ) @@ -402,18 +417,22 @@ def test_scores_are_persisted(): dataset = langfuse_client.get_dataset(dataset_name) def test_evaluator(**kwargs): - return { - "name": "persistence_test", - "value": 0.85, - "comment": "Test evaluation for persistence", - } + return Evaluation( + **{ + "name": "persistence_test", + "value": 0.85, + "comment": "Test evaluation for persistence", + } + ) def test_run_evaluator(**kwargs): - return { - "name": "persistence_run_test", - "value": 0.9, - "comment": "Test run evaluation for persistence", - } + return Evaluation( + **{ + "name": "persistence_run_test", + "value": 0.9, + "comment": "Test run evaluation for persistence", + } + ) result = dataset.run_experiment( name="Score persistence test", @@ -506,7 +525,7 @@ def test_format_experiment_results_basic(): name="Formatting test", description="Test result formatting", data=[{"input": "Hello", "expected_output": "Hi"}], - task=lambda x: f"Processed: {x['input']}", + task=lambda **kwargs: f"Processed: {kwargs['input']}", evaluators=[simple_evaluator], run_evaluators=[run_evaluator_average_length], ) From f5f2cacc3783303f66559fb0a7cddd175e53f875 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:26:00 +0200 Subject: [PATCH 05/25] push --- langfuse/_client/client.py | 16 +++++++--------- langfuse/_client/experiments.py | 4 ++-- tests/test_experiments.py | 4 ++-- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index b39ce8a0d..cccebb1b9 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2670,15 +2670,13 @@ async def _process_experiment_item( try: from langfuse.model import CreateDatasetRunItemRequest - dataset_run_item = ( - await self.async_api.dataset_run_items.create( - request=CreateDatasetRunItemRequest( - runName=experiment_name, - runDescription=experiment_description, - metadata=experiment_metadata, - datasetItemId=item.id, # type: ignore - traceId=trace_id, - ) + dataset_run_item = self.api.dataset_run_items.create( + request=CreateDatasetRunItemRequest( + runName=experiment_name, + runDescription=experiment_description, + metadata=experiment_metadata, + datasetItemId=item.id, # type: ignore + traceId=trace_id, ) ) diff --git 
a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index 65aad2649..0a80e25ac 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -37,8 +37,8 @@ class LocalExperimentItem(TypedDict, total=False): metadata: Optional[Dict[str, Any]] -ExperimentItem = Union[LocalExperimentItem, DatasetItemClient] -ExperimentData = Union[List[LocalExperimentItem], List[DatasetItemClient]] +ExperimentItem = Union[LocalExperimentItem, "DatasetItemClient"] +ExperimentData = Union[List[LocalExperimentItem], List["DatasetItemClient"]] class Evaluation(TypedDict, total=False): diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 2a20421ba..f9be524e3 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -281,7 +281,7 @@ def test_large_dataset_with_concurrency(): result = langfuse_client.run_experiment( name="Large dataset test", data=large_dataset, - task=lambda **kwargs: f"Processed {kwargs['input']}", + task=lambda **kwargs: f"Processed {kwargs['item']}", evaluators=[lambda **kwargs: {"name": "simple_eval", "value": 1.0}], max_concurrency=5, ) @@ -525,7 +525,7 @@ def test_format_experiment_results_basic(): name="Formatting test", description="Test result formatting", data=[{"input": "Hello", "expected_output": "Hi"}], - task=lambda **kwargs: f"Processed: {kwargs['input']}", + task=lambda **kwargs: f"Processed: {kwargs['item']}", evaluators=[simple_evaluator], run_evaluators=[run_evaluator_average_length], ) From ce290f5d8705c6e4952226552cbaf3c4b8df53e5 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:38:50 +0200 Subject: [PATCH 06/25] expand tests --- tests/test_experiments.py | 150 +++++++++++++++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/tests/test_experiments.py b/tests/test_experiments.py index f9be524e3..c278243ab 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -96,6 +96,48 @@ def test_run_experiment_on_local_dataset(sample_dataset): langfuse_client.flush() time.sleep(2) + # Validate traces are correctly persisted with input/output/metadata + api = get_api() + expected_inputs = ["Germany", "France", "Spain"] + expected_outputs = ["Capital of Germany", "Capital of France", "Capital of Spain"] + + for i, item_result in enumerate(result["item_results"]): + trace_id = item_result["trace_id"] + assert trace_id is not None, f"Item {i} should have a trace_id" + + # Fetch trace from API + trace = api.trace.get(trace_id) + assert trace is not None, f"Trace {trace_id} should exist" + + # Validate trace name + assert ( + trace.name == "experiment-item-run" + ), f"Trace {trace_id} should have correct name" + + # Validate trace input - should contain the experiment item + assert trace.input is not None, f"Trace {trace_id} should have input" + expected_input = expected_inputs[i] + # The input should contain the item data in some form + assert expected_input in str( + trace.input + ), f"Trace {trace_id} input should contain '{expected_input}'" + + # Validate trace output - should be the task result + assert trace.output is not None, f"Trace {trace_id} should have output" + expected_output = expected_outputs[i] + assert ( + trace.output == expected_output + ), f"Trace {trace_id} output should be '{expected_output}', got '{trace.output}'" + + # Validate trace metadata contains experiment name + assert trace.metadata is not None, f"Trace {trace_id} should have metadata" + assert ( + "experiment_name" in 
trace.metadata + ), f"Trace {trace_id} metadata should contain experiment_name" + assert ( + trace.metadata["experiment_name"] == "Euro capitals" + ), f"Trace {trace_id} metadata should have correct experiment_name" + def test_run_experiment_on_langfuse_dataset(): """Test running experiment on Langfuse dataset.""" @@ -120,8 +162,10 @@ def test_run_experiment_on_langfuse_dataset(): # Get dataset and run experiment dataset = langfuse_client.get_dataset(dataset_name) + # Use unique experiment name for proper identification + experiment_name = "Dataset Test " + create_uuid()[:8] result = dataset.run_experiment( - name="Dataset Test", + name=experiment_name, description="Test on Langfuse dataset", task=mock_task, evaluators=[factuality_evaluator], @@ -142,6 +186,110 @@ def test_run_experiment_on_langfuse_dataset(): runs = api.datasets.get_runs(dataset_name) assert len(runs.data) >= 1 + # Validate traces are correctly persisted with input/output/metadata + expected_data = {"Germany": "Capital of Germany", "France": "Capital of France"} + dataset_run_id = result["dataset_run_id"] + + # Create a mapping from dataset item ID to dataset item for validation + dataset_item_map = {item.id: item for item in dataset.items} + + for i, item_result in enumerate(result["item_results"]): + trace_id = item_result["trace_id"] + assert trace_id is not None, f"Item {i} should have a trace_id" + + # Fetch trace from API + trace = api.trace.get(trace_id) + assert trace is not None, f"Trace {trace_id} should exist" + + # Validate trace name + assert ( + trace.name == "experiment-item-run" + ), f"Trace {trace_id} should have correct name" + + # Validate trace input and output match expected pairs + assert trace.input is not None, f"Trace {trace_id} should have input" + trace_input_str = str(trace.input) + + # Find which expected input this trace corresponds to + matching_input = None + for expected_input in expected_data.keys(): + if expected_input in trace_input_str: + matching_input = expected_input + break + + assert ( + matching_input is not None + ), f"Trace {trace_id} input '{trace_input_str}' should contain one of {list(expected_data.keys())}" + + # Validate trace output matches the expected output for this input + assert trace.output is not None, f"Trace {trace_id} should have output" + expected_output = expected_data[matching_input] + assert ( + trace.output == expected_output + ), f"Trace {trace_id} output should be '{expected_output}', got '{trace.output}'" + + # Validate trace metadata contains experiment and dataset info + assert trace.metadata is not None, f"Trace {trace_id} should have metadata" + assert ( + "experiment_name" in trace.metadata + ), f"Trace {trace_id} metadata should contain experiment_name" + assert ( + trace.metadata["experiment_name"] == experiment_name + ), f"Trace {trace_id} metadata should have correct experiment_name" + + # Validate dataset-specific metadata fields + assert ( + "dataset_id" in trace.metadata + ), f"Trace {trace_id} metadata should contain dataset_id" + assert ( + trace.metadata["dataset_id"] == dataset.id + ), f"Trace {trace_id} metadata should have correct dataset_id" + + assert ( + "dataset_item_id" in trace.metadata + ), f"Trace {trace_id} metadata should contain dataset_item_id" + # Get the dataset item ID from metadata and validate it exists + dataset_item_id = trace.metadata["dataset_item_id"] + assert ( + dataset_item_id in dataset_item_map + ), f"Trace {trace_id} metadata dataset_item_id should correspond to a valid dataset item" + + # Validate the 
dataset item input matches the trace input + dataset_item = dataset_item_map[dataset_item_id] + assert ( + dataset_item.input == matching_input + ), f"Trace {trace_id} should correspond to dataset item with input '{matching_input}'" + + # Verify dataset run contains the correct trace IDs + dataset_run = None + for run in runs.data: + if run.id == dataset_run_id: + dataset_run = run + break + + assert dataset_run is not None, f"Dataset run {dataset_run_id} should exist" + assert dataset_run.name == experiment_name, "Dataset run should have correct name" + assert ( + dataset_run.description == "Test on Langfuse dataset" + ), "Dataset run should have correct description" + + # Get dataset run items to verify trace linkage + dataset_run_items = api.dataset_run_items.list( + dataset_id=dataset.id, run_name=experiment_name + ) + assert len(dataset_run_items.data) == 2, "Dataset run should have 2 items" + + # Verify each dataset run item links to the correct trace + run_item_trace_ids = { + item.trace_id for item in dataset_run_items.data if item.trace_id + } + result_trace_ids = {item["trace_id"] for item in result["item_results"]} + + assert run_item_trace_ids == result_trace_ids, ( + f"Dataset run items should link to the same traces as experiment results. " + f"Run items: {run_item_trace_ids}, Results: {result_trace_ids}" + ) + # Error Handling Tests def test_evaluator_failures_handled_gracefully(): From 477a1c9abae9262f2061238bd139d628e15e151d Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:58:43 +0200 Subject: [PATCH 07/25] expand docstrings --- langfuse/_client/client.py | 158 +++++++++-- langfuse/_client/datasets.py | 183 ++++++++++-- langfuse/_client/experiments.py | 473 ++++++++++++++++++++++++++++++-- langfuse/types.py | 55 +++- 4 files changed, 796 insertions(+), 73 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index cccebb1b9..678478d7d 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2472,39 +2472,151 @@ def run_experiment( """Run an experiment on a dataset with automatic tracing and evaluation. This method executes a task function on each item in the provided dataset, - traces the execution with Langfuse, runs evaluators on the outputs, - and returns formatted results. + automatically traces all executions with Langfuse for observability, runs + item-level and run-level evaluators on the outputs, and returns comprehensive + results with evaluation metrics. + + The experiment system provides: + - Automatic tracing of all task executions + - Concurrent processing with configurable limits + - Comprehensive error handling that isolates failures + - Integration with Langfuse datasets for experiment tracking + - Flexible evaluation framework supporting both sync and async evaluators Args: - name: Human-readable name for the experiment - description: Optional description of the experiment's purpose - data: Array of data items to process (ExperimentItem or DatasetItem) - task: Function that processes each data item and returns output - evaluators: Optional list of functions to evaluate each item's output - run_evaluators: Optional list of functions to evaluate the entire experiment - max_concurrency: Maximum number of concurrent task executions - metadata: Optional metadata to attach to the experiment + name: Human-readable name for the experiment. Used for identification + in the Langfuse UI and for dataset run naming if using Langfuse datasets. 
+ description: Optional description explaining the experiment's purpose, + methodology, or expected outcomes. + data: Array of data items to process. Can be either: + - List of dict-like items with 'input', 'expected_output', 'metadata' keys + - List of Langfuse DatasetItem objects from dataset.items + task: Function that processes each data item and returns output. + Must accept 'item' as keyword argument and can return sync or async results. + The task function signature should be: task(*, item, **kwargs) -> Any + evaluators: List of functions to evaluate each item's output individually. + Each evaluator receives input, output, expected_output, and metadata. + Can return single Evaluation dict or list of Evaluation dicts. + run_evaluators: List of functions to evaluate the entire experiment run. + Each run evaluator receives all item_results and can compute aggregate metrics. + Useful for calculating averages, distributions, or cross-item comparisons. + max_concurrency: Maximum number of concurrent task executions (default: 50). + Controls the number of items processed simultaneously. Adjust based on + API rate limits and system resources. + metadata: Optional metadata dictionary to attach to all experiment traces. + This metadata will be included in every trace created during the experiment. Returns: - ExperimentResult containing item results, evaluations, and formatting functions + ExperimentResult dictionary containing: + - item_results: List of results for each processed item with outputs and evaluations + - run_evaluations: List of aggregate evaluation results for the entire run + - dataset_run_id: ID of the dataset run (if using Langfuse datasets) + - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) - Example: + Raises: + ValueError: If required parameters are missing or invalid + Exception: If experiment setup fails (individual item failures are handled gracefully) + + Examples: + Basic experiment with local data: ```python - def task(item): - return f"Processed: {item['input']}" + def summarize_text(*, item, **kwargs): + return f"Summary: {item['input'][:50]}..." 
- def evaluator(*, input, output, expected_output=None, **kwargs): - return {"name": "length", "value": len(output)} + def length_evaluator(*, input, output, expected_output=None, **kwargs): + return { + "name": "output_length", + "value": len(output), + "comment": f"Output contains {len(output)} characters" + } result = langfuse.run_experiment( - name="Test Experiment", - data=[{"input": "test", "expected_output": "expected"}], - task=task, - evaluators=[evaluator] + name="Text Summarization Test", + description="Evaluate summarization quality and length", + data=[ + {"input": "Long article text...", "expected_output": "Expected summary"}, + {"input": "Another article...", "expected_output": "Another summary"} + ], + task=summarize_text, + evaluators=[length_evaluator] ) - print(result["item_results"]) + print(f"Processed {len(result['item_results'])} items") + for item_result in result["item_results"]: + print(f"Input: {item_result['item']['input']}") + print(f"Output: {item_result['output']}") + print(f"Evaluations: {item_result['evaluations']}") ``` + + Advanced experiment with async task and multiple evaluators: + ```python + async def llm_task(*, item, **kwargs): + # Simulate async LLM call + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": item["input"]}] + ) + return response.choices[0].message.content + + def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): + if expected_output and expected_output.lower() in output.lower(): + return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} + return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} + + def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): + # Simulate toxicity check + toxicity_score = check_toxicity(output) # Your toxicity checker + return { + "name": "toxicity", + "value": toxicity_score, + "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" + } + + def average_accuracy(*, item_results, **kwargs): + accuracies = [ + eval["value"] for result in item_results + for eval in result["evaluations"] + if eval["name"] == "accuracy" + ] + return { + "name": "average_accuracy", + "value": sum(accuracies) / len(accuracies) if accuracies else 0, + "comment": f"Average accuracy across {len(accuracies)} items" + } + + result = langfuse.run_experiment( + name="LLM Safety and Accuracy Test", + description="Evaluate model accuracy and safety across diverse prompts", + data=test_dataset, # Your dataset items + task=llm_task, + evaluators=[accuracy_evaluator, toxicity_evaluator], + run_evaluators=[average_accuracy], + max_concurrency=5, # Limit concurrent API calls + metadata={"model": "gpt-4", "temperature": 0.7} + ) + ``` + + Using with Langfuse datasets: + ```python + # Get dataset from Langfuse + dataset = langfuse.get_dataset("my-eval-dataset") + + result = dataset.run_experiment( + name="Production Model Evaluation", + description="Monthly evaluation of production model performance", + task=my_production_task, + evaluators=[accuracy_evaluator, latency_evaluator] + ) + + # Results automatically linked to dataset in Langfuse UI + print(f"View results: {result['dataset_run_url']}") + ``` + + Note: + - Task and evaluator functions can be either synchronous or asynchronous + - Individual item failures are logged but don't stop the experiment + - All executions are automatically traced and visible in Langfuse UI + - When using Langfuse datasets, results are automatically linked for easy 
comparison """ return asyncio.run( self._run_experiment_async( @@ -2596,7 +2708,7 @@ async def process_item(item: ExperimentItem) -> dict: self.create_score( dataset_run_id=dataset_run_id, name=evaluation["name"], - value=evaluation["value"], + value=evaluation["value"], # type: ignore comment=evaluation.get("comment"), metadata=evaluation.get("metadata"), ) @@ -2718,7 +2830,7 @@ async def _process_experiment_item( self.create_score( trace_id=trace_id, name=evaluation.get("name", "unknown"), - value=evaluation.get("value", -1), + value=evaluation.get("value", -1), # type: ignore comment=evaluation.get("comment"), metadata=evaluation.get("metadata"), ) diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index af79520b1..cab0d98b6 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -206,41 +206,182 @@ def run_experiment( max_concurrency: int = 50, metadata: Optional[Dict[str, Any]] = None, ) -> Any: - """Run an experiment on this dataset. + """Run an experiment on this Langfuse dataset with automatic tracking. - This is a convenience method that calls the Langfuse client's run_experiment - method with this dataset's items as the data. + This is a convenience method that runs an experiment using all items in this + dataset. It automatically creates a dataset run in Langfuse for tracking and + comparison purposes, linking all experiment results to the dataset. + + Key benefits of using dataset.run_experiment(): + - Automatic dataset run creation and linking in Langfuse UI + - Built-in experiment tracking and versioning + - Easy comparison between different experiment runs + - Direct access to dataset items with their metadata and expected outputs + - Automatic URL generation for viewing results in Langfuse dashboard Args: - name: Human-readable name for the experiment - description: Optional description of the experiment's purpose - task: Function that processes each data item and returns output - evaluators: Optional list of functions to evaluate each item's output - run_evaluators: Optional list of functions to evaluate the entire experiment - max_concurrency: Maximum number of concurrent task executions - metadata: Optional metadata to attach to the experiment + name: Human-readable name for the experiment run. This will be used as + the dataset run name in Langfuse for tracking and identification. + description: Optional description of the experiment's purpose, methodology, + or what you're testing. Appears in the Langfuse UI for context. + task: Function that processes each dataset item and returns output. + The function will receive DatasetItem objects with .input, .expected_output, + .metadata attributes. Signature should be: task(*, item, **kwargs) -> Any + evaluators: List of functions to evaluate each item's output individually. + These will have access to the item's expected_output for comparison. + run_evaluators: List of functions to evaluate the entire experiment run. + Useful for computing aggregate statistics across all dataset items. + max_concurrency: Maximum number of concurrent task executions (default: 50). + Adjust based on API rate limits and system resources. + metadata: Optional metadata to attach to the experiment run and all traces. + Will be combined with individual item metadata. 
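A quick sketch of the metadata option just described, assuming an existing dataset named `qa-evaluation-set` and Langfuse credentials configured via the environment; how run-level metadata is merged with per-item metadata on each trace is left to the implementation above:

```python
from langfuse import get_client

langfuse = get_client()
dataset = langfuse.get_dataset("qa-evaluation-set")  # hypothetical dataset name

result = dataset.run_experiment(
    name="QA regression - nightly",
    description="Nightly run with run-level metadata",
    task=lambda *, item, **kwargs: str(item.input).upper(),  # trivial placeholder task
    metadata={"model": "qa-v2", "git_sha": "abc123"},  # attached to every trace in this run
)

print(result["dataset_run_url"])  # direct link to the run in the Langfuse UI
```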
Returns: - ExperimentResult containing item results, evaluations, and formatting functions + ExperimentResult dictionary containing: + - item_results: Results for each dataset item with outputs and evaluations + - run_evaluations: Aggregate evaluation results for the entire run + - dataset_run_id: ID of the created dataset run in Langfuse + - dataset_run_url: Direct URL to view the experiment results in Langfuse UI + + Raises: + ValueError: If the dataset has no items or no Langfuse client is available - Example: + Examples: + Basic dataset experiment: ```python - dataset = langfuse.get_dataset("my-dataset") + dataset = langfuse.get_dataset("qa-evaluation-set") + + def answer_questions(*, item, **kwargs): + # item is a DatasetItem with .input, .expected_output, .metadata + question = item.input + return my_qa_system.answer(question) + + def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): + if not expected_output: + return {"name": "accuracy", "value": None, "comment": "No expected output"} + + is_correct = output.strip().lower() == expected_output.strip().lower() + return { + "name": "accuracy", + "value": 1.0 if is_correct else 0.0, + "comment": "Correct" if is_correct else "Incorrect" + } - def task(item): - return f"Processed: {item.input}" + result = dataset.run_experiment( + name="QA System v2.0 Evaluation", + description="Testing improved QA system on curated question set", + task=answer_questions, + evaluators=[accuracy_evaluator] + ) - def evaluator(*, input, output, expected_output=None, **kwargs): - return {"name": "length", "value": len(output)} + print(f"Evaluated {len(result['item_results'])} questions") + print(f"View detailed results: {result['dataset_run_url']}") + ``` + + Advanced experiment with multiple evaluators and run-level analysis: + ```python + dataset = langfuse.get_dataset("content-generation-benchmark") + + async def generate_content(*, item, **kwargs): + prompt = item.input + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": prompt}], + temperature=0.7 + ) + return response.choices[0].message.content + + def quality_evaluator(*, input, output, expected_output=None, metadata=None, **kwargs): + # Use metadata for context-aware evaluation + content_type = metadata.get("type", "general") if metadata else "general" + + # Basic quality checks + word_count = len(output.split()) + min_words = {"blog": 300, "tweet": 10, "summary": 100}.get(content_type, 50) + + return [ + { + "name": "word_count", + "value": word_count, + "comment": f"Generated {word_count} words" + }, + { + "name": "meets_length_requirement", + "value": word_count >= min_words, + "comment": f"{'Meets' if word_count >= min_words else 'Below'} minimum {min_words} words for {content_type}" + } + ] + + def content_diversity(*, item_results, **kwargs): + # Analyze diversity across all generated content + all_outputs = [result["output"] for result in item_results] + unique_words = set() + total_words = 0 + + for output in all_outputs: + words = output.lower().split() + unique_words.update(words) + total_words += len(words) + + diversity_ratio = len(unique_words) / total_words if total_words > 0 else 0 + + return { + "name": "vocabulary_diversity", + "value": diversity_ratio, + "comment": f"Used {len(unique_words)} unique words out of {total_words} total ({diversity_ratio:.2%} diversity)" + } result = dataset.run_experiment( - name="Dataset Test Experiment", - task=task, - evaluators=[evaluator] + name="Content Generation 
Diversity Test", + description="Evaluating content quality and vocabulary diversity across different content types", + task=generate_content, + evaluators=[quality_evaluator], + run_evaluators=[content_diversity], + max_concurrency=3, # Limit API calls + metadata={"model": "gpt-4", "temperature": 0.7} + ) + + # Results are automatically linked to dataset in Langfuse + print(f"Experiment completed! View in Langfuse: {result['dataset_run_url']}") + + # Access individual results + for i, item_result in enumerate(result["item_results"]): + print(f"Item {i+1}: {item_result['evaluations']}") + ``` + + Comparing different model versions: + ```python + # Run multiple experiments on the same dataset for comparison + dataset = langfuse.get_dataset("model-benchmark") + + # Experiment 1: GPT-4 + result_gpt4 = dataset.run_experiment( + name="GPT-4 Baseline", + description="Baseline performance with GPT-4", + task=lambda *, item, **kwargs: gpt4_model.generate(item.input), + evaluators=[accuracy_evaluator, fluency_evaluator] ) - print(result["item_results"]) + # Experiment 2: Custom model + result_custom = dataset.run_experiment( + name="Custom Model v1.2", + description="Testing our fine-tuned model", + task=lambda *, item, **kwargs: custom_model.generate(item.input), + evaluators=[accuracy_evaluator, fluency_evaluator] + ) + + # Both experiments are now visible in Langfuse for easy comparison + print("Compare results in Langfuse:") + print(f"GPT-4: {result_gpt4['dataset_run_url']}") + print(f"Custom: {result_custom['dataset_run_url']}") ``` + + Note: + - All experiment results are automatically tracked in Langfuse as dataset runs + - Dataset items provide .input, .expected_output, and .metadata attributes + - Results can be easily compared across different experiment runs in the UI + - The dataset_run_url provides direct access to detailed results and analysis + - Failed items are handled gracefully and logged without stopping the experiment """ langfuse_client = self._get_langfuse_client() if not langfuse_client: diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index 0a80e25ac..8628da489 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -24,12 +24,49 @@ class LocalExperimentItem(TypedDict, total=False): - """Structure for experiment data items. - - Args: - input: The input data to pass to the task function - expected_output: Optional expected output for evaluation purposes - metadata: Optional metadata for the experiment item + """Structure for local experiment data items (not from Langfuse datasets). + + This TypedDict defines the structure for experiment items when using local data + rather than Langfuse-hosted datasets. All fields are optional to provide + flexibility in data structure. + + Attributes: + input: The input data to pass to the task function. Can be any type that + your task function can process (string, dict, list, etc.). This is + typically the prompt, question, or data that your task will operate on. + expected_output: Optional expected/ground truth output for evaluation purposes. + Used by evaluators to assess correctness or quality. Can be None if + no ground truth is available. + metadata: Optional metadata dictionary containing additional context about + this specific item. Can include information like difficulty level, + category, source, or any other relevant attributes that evaluators + might use for context-aware evaluation. 
+ + Examples: + Simple text processing item: + ```python + item: LocalExperimentItem = { + "input": "Summarize this article: ...", + "expected_output": "Expected summary...", + "metadata": {"difficulty": "medium", "category": "news"} + } + ``` + + Classification item: + ```python + item: LocalExperimentItem = { + "input": {"text": "This movie is great!", "context": "movie review"}, + "expected_output": "positive", + "metadata": {"dataset_source": "imdb", "confidence": 0.95} + } + ``` + + Minimal item with only input: + ```python + item: LocalExperimentItem = { + "input": "What is the capital of France?" + } + ``` """ input: Any @@ -38,21 +75,88 @@ class LocalExperimentItem(TypedDict, total=False): ExperimentItem = Union[LocalExperimentItem, "DatasetItemClient"] +"""Type alias for items that can be processed in experiments. + +Can be either: +- LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys +- DatasetItemClient: Items from Langfuse datasets with .input, .expected_output, .metadata attributes +""" + ExperimentData = Union[List[LocalExperimentItem], List["DatasetItemClient"]] +"""Type alias for experiment datasets. +Represents the collection of items to process in an experiment. Can be either: +- List[LocalExperimentItem]: Local data items as dictionaries +- List[DatasetItemClient]: Items from a Langfuse dataset (typically from dataset.items) +""" -class Evaluation(TypedDict, total=False): - """Structure for evaluation results. - Args: - name: Name of the evaluation metric - value: The evaluation score/value (numeric or string) - comment: Optional comment explaining the evaluation - metadata: Optional metadata for the evaluation +class Evaluation(TypedDict, total=False): + """Structure for evaluation results returned by evaluator functions. + + This TypedDict defines the standardized format that all evaluator functions + must return. It provides a consistent structure for storing evaluation metrics + and their metadata across different types of evaluators. + + Attributes: + name: Unique identifier for the evaluation metric. Should be descriptive + and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). + Used for aggregation and comparison across experiment runs. + value: The evaluation score or result. Can be: + - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42) + - String: For categorical results like "positive", "negative", "neutral" + - Boolean: For binary assessments like "passes_safety_check" + - None: When evaluation cannot be computed (missing data, API errors, etc.) + comment: Optional human-readable explanation of the evaluation result. + Useful for providing context, explaining scoring rationale, or noting + special conditions. Displayed in Langfuse UI for interpretability. + metadata: Optional structured metadata about the evaluation process. + Can include confidence scores, intermediate calculations, model versions, + or any other relevant technical details. 
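Because `Evaluation` is declared with `total=False`, every key is optional at construction time, so evaluators can return only what they actually have; a minimal sketch of equivalent ways to build one (values are illustrative):

```python
from langfuse.types import Evaluation  # re-exported via langfuse.types later in this patch series

# Keyword construction with only the keys you need
concise: Evaluation = Evaluation(name="accuracy", value=0.9)

# A plain dict literal satisfies the same TypedDict
verbose: Evaluation = {
    "name": "accuracy",
    "value": 0.9,
    "comment": "9 of 10 answers matched",
    "metadata": {"correct": 9, "total": 10},
}
```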
+ + Examples: + Quantitative accuracy evaluation: + ```python + accuracy_result: Evaluation = { + "name": "accuracy", + "value": 0.85, + "comment": "85% of responses were correct", + "metadata": {"total_items": 100, "correct_items": 85} + } + ``` + + Qualitative assessment: + ```python + sentiment_result: Evaluation = { + "name": "sentiment", + "value": "positive", + "comment": "Response expresses optimistic viewpoint", + "metadata": {"confidence": 0.92, "model": "sentiment-analyzer-v2"} + } + ``` + + Binary check: + ```python + safety_result: Evaluation = { + "name": "safety_check", + "value": True, + "comment": "Content passes all safety filters" + } + ``` + + Failed evaluation: + ```python + failed_result: Evaluation = { + "name": "external_api_score", + "value": None, + "comment": "External API unavailable", + "metadata": {"error": "timeout", "retry_count": 3} + } + ``` """ name: str - value: Union[int, float, str, bool] + value: Union[int, float, str, bool, None] comment: Optional[str] metadata: Optional[Dict[str, Any]] @@ -92,7 +196,18 @@ class ExperimentResult(TypedDict): class TaskFunction(Protocol): - """Protocol for experiment task functions.""" + """Protocol defining the interface for experiment task functions. + + Task functions are the core processing functions that operate on each item + in an experiment dataset. They receive an experiment item as input and + produce some output that will be evaluated. + + Task functions must: + - Accept 'item' as a keyword argument + - Return any type of output (will be passed to evaluators) + - Can be either synchronous or asynchronous + - Should handle their own errors gracefully (exceptions will be logged) + """ def __call__( self, @@ -102,17 +217,72 @@ def __call__( ) -> Union[Any, Awaitable[Any]]: """Execute the task on an experiment item. + This method defines the core processing logic for each item in your experiment. + The implementation should focus on the specific task you want to evaluate, + such as text generation, classification, summarization, etc. + Args: - item: The experiment or dataset item to process + item: The experiment item to process. Can be either: + - Dict with keys like 'input', 'expected_output', 'metadata' + - Langfuse DatasetItem object with .input, .expected_output attributes + **kwargs: Additional keyword arguments that may be passed by the framework Returns: - The task output (can be sync or async) + Any: The output of processing the item. This output will be: + - Stored in the experiment results + - Passed to all item-level evaluators for assessment + - Traced automatically in Langfuse for observability + + Can return either a direct value or an awaitable (async) result. 
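Since `TaskFunction` is a `typing.Protocol`, any callable with a matching keyword-only signature satisfies it structurally, which is useful for static type checking; a small sketch (the task body is a placeholder):

```python
from langfuse.types import ExperimentItem, TaskFunction

def echo_task(*, item: ExperimentItem, **kwargs) -> str:
    # Handles both local dict items and DatasetItemClient objects
    value = item["input"] if isinstance(item, dict) else item.input
    return f"echo: {value}"

# Assignable without inheriting from the protocol - structural typing does the work
task: TaskFunction = echo_task
```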
+ + Examples: + Simple synchronous task: + ```python + def my_task(*, item, **kwargs): + prompt = f"Summarize: {item['input']}" + return my_llm_client.generate(prompt) + ``` + + Async task with error handling: + ```python + async def my_async_task(*, item, **kwargs): + try: + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": item["input"]}] + ) + return response.choices[0].message.content + except Exception as e: + # Log error and return fallback + print(f"Task failed for item {item}: {e}") + return "Error: Could not process item" + ``` + + Task using dataset item attributes: + ```python + def classification_task(*, item, **kwargs): + # Works with both dict items and DatasetItem objects + text = item["input"] if isinstance(item, dict) else item.input + return classify_text(text) + ``` """ ... class EvaluatorFunction(Protocol): - """Protocol for item-level evaluator functions.""" + """Protocol defining the interface for item-level evaluator functions. + + Item-level evaluators assess the quality, correctness, or other properties + of individual task outputs. They receive the input, output, expected output, + and metadata for each item and return evaluation metrics. + + Evaluators should: + - Accept input, output, expected_output, and metadata as keyword arguments + - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields + - Be deterministic when possible for reproducible results + - Handle edge cases gracefully (missing expected output, malformed data, etc.) + - Can be either synchronous or asynchronous + """ def __call__( self, @@ -125,22 +295,134 @@ def __call__( ) -> Union[ Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] ]: - """Evaluate a task output. + """Evaluate a task output for quality, correctness, or other metrics. + + This method should implement specific evaluation logic such as accuracy checking, + similarity measurement, toxicity detection, fluency assessment, etc. Args: - input: The original input to the task - output: The output produced by the task - expected_output: The expected output (if available) - metadata: Optional metadata from the experiment item + input: The original input that was passed to the task function. + This is typically the item['input'] or item.input value. + output: The output produced by the task function for this input. + This is the direct return value from your task function. + expected_output: The expected/ground truth output for comparison. + May be None if not available in the dataset. Evaluators should + handle this case appropriately. + metadata: Optional metadata from the experiment item that might + contain additional context for evaluation (categories, difficulty, etc.) 
+ **kwargs: Additional keyword arguments that may be passed by the framework Returns: - Single evaluation or list of evaluations (can be sync or async) + Evaluation results in one of these formats: + - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."} + - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}] + - Awaitable returning either of the above (for async evaluators) + + Each Evaluation dict should contain: + - name (str): Unique identifier for this evaluation metric + - value (int|float|str|bool): The evaluation score or result + - comment (str, optional): Human-readable explanation of the result + - metadata (dict, optional): Additional structured data about the evaluation + + Examples: + Simple accuracy evaluator: + ```python + def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): + if expected_output is None: + return {"name": "accuracy", "value": None, "comment": "No expected output"} + + is_correct = output.strip().lower() == expected_output.strip().lower() + return { + "name": "accuracy", + "value": 1.0 if is_correct else 0.0, + "comment": "Exact match" if is_correct else "No match" + } + ``` + + Multi-metric evaluator: + ```python + def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): + results = [] + + # Length check + results.append({ + "name": "output_length", + "value": len(output), + "comment": f"Output contains {len(output)} characters" + }) + + # Sentiment analysis + sentiment_score = analyze_sentiment(output) + results.append({ + "name": "sentiment", + "value": sentiment_score, + "comment": f"Sentiment score: {sentiment_score:.2f}" + }) + + return results + ``` + + Async evaluator using external API: + ```python + async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs): + prompt = f"Rate the quality of this response on a scale of 1-10:\n" + prompt += f"Question: {input}\nResponse: {output}" + + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": prompt}] + ) + + try: + score = float(response.choices[0].message.content.strip()) + return { + "name": "llm_judge_quality", + "value": score, + "comment": f"LLM judge rated this {score}/10" + } + except ValueError: + return { + "name": "llm_judge_quality", + "value": None, + "comment": "Could not parse LLM judge score" + } + ``` + + Context-aware evaluator: + ```python + def context_evaluator(*, input, output, metadata=None, **kwargs): + # Use metadata for context-specific evaluation + difficulty = metadata.get("difficulty", "medium") if metadata else "medium" + + # Adjust expectations based on difficulty + min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty] + + meets_requirement = len(output) >= min_length + return { + "name": f"meets_{difficulty}_requirement", + "value": meets_requirement, + "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement" + } + ``` """ ... class RunEvaluatorFunction(Protocol): - """Protocol for run-level evaluator functions.""" + """Protocol defining the interface for run-level evaluator functions. + + Run-level evaluators assess aggregate properties of the entire experiment run, + computing metrics that span across all items rather than individual outputs. + They receive the complete results from all processed items and can compute + statistics like averages, distributions, correlations, or other aggregate metrics. 
+ + Run evaluators should: + - Accept item_results as a keyword argument containing all item results + - Return Evaluation dict(s) with aggregate metrics + - Handle cases where some items may have failed processing + - Compute meaningful statistics across the dataset + - Can be either synchronous or asynchronous + """ def __call__( self, @@ -150,13 +432,148 @@ def __call__( ) -> Union[ Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] ]: - """Evaluate the entire experiment run. + """Evaluate the entire experiment run with aggregate metrics. + + This method should implement aggregate evaluation logic such as computing + averages, calculating distributions, finding correlations, detecting patterns + across items, or performing statistical analysis on the experiment results. Args: - item_results: Results from all processed experiment items + item_results: List of results from all successfully processed experiment items. + Each item result contains: + - item: The original experiment item + - output: The task function's output for this item + - evaluations: List of item-level evaluation results + - trace_id: Langfuse trace ID for this execution + - dataset_run_id: Dataset run ID (if using Langfuse datasets) + + Note: This list only includes items that were successfully processed. + Failed items are excluded but logged separately. + **kwargs: Additional keyword arguments that may be passed by the framework Returns: - Single evaluation or list of evaluations (can be sync or async) + Evaluation results in one of these formats: + - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."} + - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}] + - Awaitable returning either of the above (for async evaluators) + + Each Evaluation dict should contain: + - name (str): Unique identifier for this run-level metric + - value (int|float|str|bool): The aggregate evaluation result + - comment (str, optional): Human-readable explanation of the metric + - metadata (dict, optional): Additional structured data about the evaluation + + Examples: + Average accuracy calculator: + ```python + def average_accuracy(*, item_results, **kwargs): + if not item_results: + return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"} + + accuracy_values = [] + for result in item_results: + for evaluation in result["evaluations"]: + if evaluation["name"] == "accuracy": + accuracy_values.append(evaluation["value"]) + + if not accuracy_values: + return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"} + + avg = sum(accuracy_values) / len(accuracy_values) + return { + "name": "avg_accuracy", + "value": avg, + "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}" + } + ``` + + Multiple aggregate metrics: + ```python + def statistical_summary(*, item_results, **kwargs): + if not item_results: + return [] + + results = [] + + # Calculate output length statistics + lengths = [len(str(result["output"])) for result in item_results] + results.extend([ + {"name": "avg_output_length", "value": sum(lengths) / len(lengths)}, + {"name": "min_output_length", "value": min(lengths)}, + {"name": "max_output_length", "value": max(lengths)} + ]) + + # Success rate + total_items = len(item_results) # Only successful items are included + results.append({ + "name": "processing_success_rate", + "value": 1.0, # All items in item_results succeeded + "comment": f"Successfully processed {total_items} items" + }) + 
+ return results + ``` + + Async run evaluator with external analysis: + ```python + async def llm_batch_analysis(*, item_results, **kwargs): + # Prepare batch analysis prompt + outputs = [result["output"] for result in item_results] + prompt = f"Analyze these {len(outputs)} outputs for common themes:\n" + prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs)) + + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": prompt}] + ) + + return { + "name": "thematic_analysis", + "value": response.choices[0].message.content, + "comment": f"LLM analysis of {len(outputs)} outputs" + } + ``` + + Performance distribution analysis: + ```python + def performance_distribution(*, item_results, **kwargs): + # Extract all evaluation scores + all_scores = [] + score_by_metric = {} + + for result in item_results: + for evaluation in result["evaluations"]: + metric_name = evaluation["name"] + value = evaluation["value"] + + if isinstance(value, (int, float)): + all_scores.append(value) + if metric_name not in score_by_metric: + score_by_metric[metric_name] = [] + score_by_metric[metric_name].append(value) + + results = [] + + # Overall score distribution + if all_scores: + import statistics + results.append({ + "name": "score_std_dev", + "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0, + "comment": f"Standard deviation across all numeric scores" + }) + + # Per-metric statistics + for metric, scores in score_by_metric.items(): + if len(scores) > 1: + results.append({ + "name": f"{metric}_variance", + "value": statistics.variance(scores), + "comment": f"Variance in {metric} across {len(scores)} items" + }) + + return results + ``` """ ... diff --git a/langfuse/types.py b/langfuse/types.py index b654fffed..8a186a345 100644 --- a/langfuse/types.py +++ b/langfuse/types.py @@ -1,4 +1,21 @@ -"""@private""" +"""Public API for all Langfuse types. + +This module provides a centralized location for importing commonly used types +from the Langfuse SDK, making them easily accessible without requiring nested imports. 
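Beyond the task and evaluation types, the dataset- and result-level aliases exported here let a whole experiment run be annotated end to end; a brief sketch against the `run_experiment` API added earlier in this series (assumes Langfuse credentials are configured):

```python
from langfuse import get_client
from langfuse.types import ExperimentData, ExperimentResult

data: ExperimentData = [
    {"input": "Germany", "expected_output": "Berlin"},
    {"input": "France", "expected_output": "Paris"},
]

result: ExperimentResult = get_client().run_experiment(
    name="Typed experiment",
    data=data,
    task=lambda *, item, **kwargs: f"Capital of {item['input']}",
)
```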
+ +Example: + ```python + from langfuse.types import Evaluation, LocalExperimentItem, TaskFunction + + # Define your task function + def my_task(*, item: LocalExperimentItem, **kwargs) -> str: + return f"Processed: {item['input']}" + + # Define your evaluator + def my_evaluator(*, output: str, **kwargs) -> Evaluation: + return {"name": "length", "value": len(output)} + ``` +""" from datetime import datetime from typing import ( @@ -22,6 +39,19 @@ from langfuse.api import MediaContentType, UsageDetails from langfuse.model import MapValue, ModelUsage, PromptClient +# Experiment types +from ._client.experiments import ( + LocalExperimentItem, + ExperimentItem, + ExperimentData, + Evaluation, + ExperimentItemResult, + ExperimentResult, + TaskFunction, + EvaluatorFunction, + RunEvaluatorFunction, +) + SpanLevel = Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"] ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"] @@ -84,3 +114,26 @@ class ParsedMediaReference(TypedDict): class TraceContext(TypedDict): trace_id: str parent_span_id: NotRequired[str] + + +# Export experiment types for easy access +__all__ = [ + # Experiment types + "LocalExperimentItem", + "ExperimentItem", + "ExperimentData", + "Evaluation", + "ExperimentItemResult", + "ExperimentResult", + "TaskFunction", + "EvaluatorFunction", + "RunEvaluatorFunction", + # Core types (keeping existing functionality) + "SpanLevel", + "ScoreDataType", + "TraceMetadata", + "ObservationParams", + "MaskFunction", + "ParsedMediaReference", + "TraceContext", +] From b8b2f8c304aafca556e89ff1da51135c795a2cb6 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:16:52 +0200 Subject: [PATCH 08/25] add run safe async --- langfuse/_client/client.py | 28 ++-- langfuse/_client/datasets.py | 2 + langfuse/_client/utils.py | 69 +++++++++- tests/test_utils.py | 254 +++++++++++++++++++++++++++++++++++ 4 files changed, 341 insertions(+), 12 deletions(-) create mode 100644 tests/test_utils.py diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 6d17ba0dc..b51402951 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -80,6 +80,7 @@ LangfuseSpan, LangfuseTool, ) +from langfuse._client.utils import run_async_safely from langfuse._utils import _get_timestamp from langfuse._utils.parse_error import handle_fern_exception from langfuse._utils.prompt_cache import PromptCache @@ -2617,18 +2618,23 @@ def average_accuracy(*, item_results, **kwargs): - Individual item failures are logged but don't stop the experiment - All executions are automatically traced and visible in Langfuse UI - When using Langfuse datasets, results are automatically linked for easy comparison + - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 
+ - Async execution is handled automatically with smart event loop detection """ - return asyncio.run( - self._run_experiment_async( - name=name, - description=description, - data=data, - task=task, - evaluators=evaluators or [], - run_evaluators=run_evaluators or [], - max_concurrency=max_concurrency, - metadata=metadata or {}, - ) + return cast( + ExperimentResult, + run_async_safely( + self._run_experiment_async( + name=name, + description=description, + data=data, + task=task, + evaluators=evaluators or [], + run_evaluators=run_evaluators or [], + max_concurrency=max_concurrency, + metadata=metadata or {}, + ), + ), ) async def _run_experiment_async( diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index cab0d98b6..023b7f947 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -382,6 +382,8 @@ def content_diversity(*, item_results, **kwargs): - Results can be easily compared across different experiment runs in the UI - The dataset_run_url provides direct access to detailed results and analysis - Failed items are handled gracefully and logged without stopping the experiment + - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) + - Async execution is handled automatically with smart event loop detection """ langfuse_client = self._get_langfuse_client() if not langfuse_client: diff --git a/langfuse/_client/utils.py b/langfuse/_client/utils.py index dac7a3f1b..d34857ebd 100644 --- a/langfuse/_client/utils.py +++ b/langfuse/_client/utils.py @@ -1,10 +1,13 @@ """Utility functions for Langfuse OpenTelemetry integration. This module provides utility functions for working with OpenTelemetry spans, -including formatting and serialization of span data. +including formatting and serialization of span data, and async execution helpers. """ +import asyncio import json +import threading +from typing import Any, Coroutine from opentelemetry import trace as otel_trace_api from opentelemetry.sdk import util @@ -58,3 +61,67 @@ def span_formatter(span: ReadableSpan) -> str: ) + "\n" ) + + +class _RunAsyncThread(threading.Thread): + """Helper thread class for running async coroutines in a separate thread.""" + + def __init__(self, coro: Coroutine[Any, Any, Any]) -> None: + self.coro = coro + self.result: Any = None + self.exception: Exception | None = None + super().__init__() + + def run(self) -> None: + try: + self.result = asyncio.run(self.coro) + except Exception as e: + self.exception = e + + +def run_async_safely(coro: Coroutine[Any, Any, Any]) -> Any: + """Safely run an async coroutine, handling existing event loops. + + This function detects if there's already a running event loop and uses + a separate thread if needed to avoid the "asyncio.run() cannot be called + from a running event loop" error. This is particularly useful in environments + like Jupyter notebooks, FastAPI applications, or other async frameworks. 
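In practice this helper is what allows the synchronous `run_experiment` wrapper to be called from code that is already running inside an event loop; a hedged sketch of that situation (the job function and data are placeholders):

```python
import asyncio

from langfuse import get_client

async def nightly_eval_job() -> None:
    # Already inside a running loop, yet the sync wrapper stays safe to call:
    # per the helper above, the coroutine runs on a separate thread instead of asyncio.run().
    result = get_client().run_experiment(
        name="Nightly eval",
        data=[{"input": "ping"}],
        task=lambda *, item, **kwargs: "pong",
    )
    print(len(result["item_results"]))

asyncio.run(nightly_eval_job())
```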
+ + Args: + coro: The coroutine to run + + Returns: + The result of the coroutine + + Raises: + Any exception raised by the coroutine + + Example: + ```python + # Works in both sync and async contexts + async def my_async_function(): + await asyncio.sleep(1) + return "done" + + result = run_async_safely(my_async_function()) + ``` + """ + try: + # Check if there's already a running event loop + loop = asyncio.get_running_loop() + except RuntimeError: + # No running loop, safe to use asyncio.run() + return asyncio.run(coro) + + if loop and loop.is_running(): + # There's a running loop, use a separate thread + thread = _RunAsyncThread(coro) + thread.start() + thread.join() + + if thread.exception: + raise thread.exception + return thread.result + else: + # Loop exists but not running, safe to use asyncio.run() + return asyncio.run(coro) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 000000000..ac3ee8473 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,254 @@ +"""Test suite for utility functions in langfuse._client.utils module.""" + +import asyncio +import threading +from unittest import mock + +import pytest + +from langfuse._client.utils import run_async_safely + + +class TestRunAsyncSafely: + """Test suite for the run_async_safely function.""" + + def test_run_sync_context_simple(self): + """Test run_async_safely in sync context with simple coroutine.""" + + async def simple_coro(): + await asyncio.sleep(0.01) + return "hello" + + result = run_async_safely(simple_coro()) + assert result == "hello" + + def test_run_sync_context_with_value(self): + """Test run_async_safely in sync context with parameter passing.""" + + async def coro_with_params(value, multiplier=2): + await asyncio.sleep(0.01) + return value * multiplier + + result = run_async_safely(coro_with_params(5, multiplier=3)) + assert result == 15 + + def test_run_sync_context_with_exception(self): + """Test run_async_safely properly propagates exceptions in sync context.""" + + async def failing_coro(): + await asyncio.sleep(0.01) + raise ValueError("Test error") + + with pytest.raises(ValueError, match="Test error"): + run_async_safely(failing_coro()) + + @pytest.mark.asyncio + async def test_run_async_context_simple(self): + """Test run_async_safely from within async context (uses threading).""" + + async def simple_coro(): + await asyncio.sleep(0.01) + return "from_thread" + + # This should use threading since we're already in an async context + result = run_async_safely(simple_coro()) + assert result == "from_thread" + + @pytest.mark.asyncio + async def test_run_async_context_with_exception(self): + """Test run_async_safely properly propagates exceptions from thread.""" + + async def failing_coro(): + await asyncio.sleep(0.01) + raise RuntimeError("Thread error") + + with pytest.raises(RuntimeError, match="Thread error"): + run_async_safely(failing_coro()) + + @pytest.mark.asyncio + async def test_run_async_context_thread_isolation(self): + """Test that threaded execution is properly isolated.""" + # Set a thread-local value in the main async context + threading.current_thread().test_value = "main_thread" + + async def check_thread_isolation(): + # This should run in a different thread + current_thread = threading.current_thread() + # Should not have the test_value from main thread + assert not hasattr(current_thread, "test_value") + return "isolated" + + result = run_async_safely(check_thread_isolation()) + assert result == "isolated" + + def test_multiple_calls_sync_context(self): + 
"""Test multiple sequential calls in sync context.""" + + async def counter_coro(count): + await asyncio.sleep(0.001) + return count * 2 + + results = [] + for i in range(5): + result = run_async_safely(counter_coro(i)) + results.append(result) + + assert results == [0, 2, 4, 6, 8] + + @pytest.mark.asyncio + async def test_multiple_calls_async_context(self): + """Test multiple sequential calls in async context (each uses threading).""" + + async def counter_coro(count): + await asyncio.sleep(0.001) + return count * 3 + + results = [] + for i in range(3): + result = run_async_safely(counter_coro(i)) + results.append(result) + + assert results == [0, 3, 6] + + def test_concurrent_calls_sync_context(self): + """Test concurrent calls in sync context using threading.""" + + async def slow_coro(value): + await asyncio.sleep(0.02) + return value**2 + + import concurrent.futures + + with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: + futures = [] + for i in range(3): + future = executor.submit(run_async_safely, slow_coro(i + 1)) + futures.append(future) + + results = [future.result() for future in futures] + + # Results should be squares: 1^2, 2^2, 3^2 + assert sorted(results) == [1, 4, 9] + + def test_event_loop_detection_mock(self): + """Test event loop detection logic with mocking.""" + + async def simple_coro(): + return "mocked" + + # Mock no running loop - should use asyncio.run + with mock.patch( + "asyncio.get_running_loop", side_effect=RuntimeError("No loop") + ): + with mock.patch( + "asyncio.run", return_value="asyncio_run_called" + ) as mock_run: + result = run_async_safely(simple_coro()) + assert result == "asyncio_run_called" + mock_run.assert_called_once() + + def test_complex_coroutine(self): + """Test with a more complex coroutine that does actual async work.""" + + async def complex_coro(): + # Simulate some async operations + results = [] + for i in range(3): + await asyncio.sleep(0.001) + results.append(i**2) + + # Simulate concurrent operations + async def sub_task(x): + await asyncio.sleep(0.001) + return x * 10 + + tasks = [sub_task(x) for x in range(2)] + concurrent_results = await asyncio.gather(*tasks) + results.extend(concurrent_results) + + return results + + result = run_async_safely(complex_coro()) + assert result == [0, 1, 4, 0, 10] # [0^2, 1^2, 2^2, 0*10, 1*10] + + @pytest.mark.asyncio + async def test_nested_async_calls(self): + """Test that nested calls to run_async_safely work correctly.""" + + async def inner_coro(value): + await asyncio.sleep(0.001) + return value * 2 + + async def outer_coro(value): + # This is already in an async context, so the inner call + # will also use threading + inner_result = run_async_safely(inner_coro(value)) + await asyncio.sleep(0.001) + return inner_result + 1 + + result = run_async_safely(outer_coro(5)) + assert result == 11 # (5 * 2) + 1 + + def test_exception_types_preserved(self): + """Test that different exception types are properly preserved.""" + + async def custom_exception_coro(): + await asyncio.sleep(0.001) + + class CustomError(Exception): + pass + + raise CustomError("Custom error message") + + with pytest.raises(Exception) as exc_info: + run_async_safely(custom_exception_coro()) + + # The exception type should be preserved + assert "Custom error message" in str(exc_info.value) + + def test_return_types_preserved(self): + """Test that various return types are properly preserved.""" + + async def dict_coro(): + await asyncio.sleep(0.001) + return {"key": "value", "number": 42} + + async def 
list_coro(): + await asyncio.sleep(0.001) + return [1, 2, 3, "string"] + + async def none_coro(): + await asyncio.sleep(0.001) + return None + + dict_result = run_async_safely(dict_coro()) + assert dict_result == {"key": "value", "number": 42} + assert isinstance(dict_result, dict) + + list_result = run_async_safely(list_coro()) + assert list_result == [1, 2, 3, "string"] + assert isinstance(list_result, list) + + none_result = run_async_safely(none_coro()) + assert none_result is None + + @pytest.mark.asyncio + async def test_real_world_scenario_jupyter_simulation(self): + """Test scenario simulating Jupyter notebook environment.""" + # This simulates being called from a Jupyter cell where there's + # already an event loop running + + async def simulate_llm_call(prompt): + """Simulate an LLM API call.""" + await asyncio.sleep(0.01) # Simulate network delay + return f"Response to: {prompt}" + + async def simulate_experiment_task(item): + """Simulate an experiment task function.""" + response = await simulate_llm_call(item["input"]) + await asyncio.sleep(0.001) # Additional processing + return response + + # This should work even though we're in an async context + result = run_async_safely(simulate_experiment_task({"input": "test prompt"})) + assert result == "Response to: test prompt" From db09d7fa7ecda28c75ee213d30137d94efe57f16 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Fri, 12 Sep 2025 10:29:15 +0200 Subject: [PATCH 09/25] push --- langfuse/_client/client.py | 3 +++ langfuse/_client/experiments.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index b51402951..27958b967 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2722,6 +2722,9 @@ async def process_item(item: ExperimentItem) -> dict: except Exception as e: langfuse_logger.error(f"Failed to store run evaluation: {e}") + # Flush scores and traces + self.flush() + return { "item_results": valid_results, "run_evaluations": run_evaluations, diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index 8628da489..5833af70a 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -289,8 +289,8 @@ def __call__( *, input: Any, output: Any, - expected_output: Any = None, - metadata: Optional[Dict[str, Any]] = None, + expected_output: Any, + metadata: Optional[Dict[str, Any]], **kwargs: Dict[str, Any], ) -> Union[ Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] From 285cc99d5f9eadc4b576cf395371c855a6215fb2 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Fri, 12 Sep 2025 10:31:45 +0200 Subject: [PATCH 10/25] push --- langfuse/_task_manager/media_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/langfuse/_task_manager/media_manager.py b/langfuse/_task_manager/media_manager.py index a36e3b8af..1a32e3d60 100644 --- a/langfuse/_task_manager/media_manager.py +++ b/langfuse/_task_manager/media_manager.py @@ -49,7 +49,6 @@ def process_next_media_upload(self) -> None: self._queue.task_done() except Empty: - self._log.debug("Queue: Media upload queue is empty, waiting for new jobs") pass except Exception as e: self._log.error( @@ -248,7 +247,7 @@ def _process_upload_media_job( headers = {"Content-Type": data["content_type"]} - # In self-hosted setups with GCP, do not add unsupported headers that fail the upload + # In self-hosted setups 
with GCP, do not add unsupported headers that fail the upload is_self_hosted_gcs_bucket = "storage.googleapis.com" in upload_url if not is_self_hosted_gcs_bucket: From f94dab3b9494eb78c42b1b5d53d7e92f2b04b1de Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Fri, 12 Sep 2025 14:08:32 +0200 Subject: [PATCH 11/25] add autoevals adapter --- langfuse/_client/experiments.py | 32 +++ langfuse/experiment.py | 25 ++ langfuse/types.py | 25 -- poetry.lock | 460 ++++++++++++++++++++++++-------- pyproject.toml | 1 + 5 files changed, 401 insertions(+), 142 deletions(-) create mode 100644 langfuse/experiment.py diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index 5833af70a..2e004d686 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -747,3 +747,35 @@ async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any: result = await result return result + + +def create_evaluator_from_autoevals( + autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]] +) -> EvaluatorFunction: + """Create a Langfuse evaluator from an autoevals evaluator. + + Args: + autoevals_evaluator: An autoevals evaluator instance + **kwargs: Additional arguments passed to the evaluator + + Returns: + A Langfuse-compatible evaluator function + """ + + def langfuse_evaluator( + *, + input: Any, + output: Any, + expected_output: Any, + metadata: Optional[Dict[str, Any]], + **kwargs: Dict[str, Any], + ) -> Evaluation: + evaluation = autoevals_evaluator( + input=input, output=output, expected=expected_output, **kwargs + ) + + return Evaluation( + name=evaluation.name, value=evaluation.score, metadata=evaluation.metadata + ) + + return langfuse_evaluator diff --git a/langfuse/experiment.py b/langfuse/experiment.py new file mode 100644 index 000000000..2d54255e2 --- /dev/null +++ b/langfuse/experiment.py @@ -0,0 +1,25 @@ +from ._client.experiments import ( + Evaluation, + EvaluatorFunction, + ExperimentData, + ExperimentItem, + ExperimentItemResult, + ExperimentResult, + LocalExperimentItem, + RunEvaluatorFunction, + TaskFunction, + create_evaluator_from_autoevals, +) + +__all__ = [ + "LocalExperimentItem", + "ExperimentItem", + "ExperimentData", + "Evaluation", + "ExperimentItemResult", + "ExperimentResult", + "TaskFunction", + "EvaluatorFunction", + "RunEvaluatorFunction", + "create_evaluator_from_autoevals", +] diff --git a/langfuse/types.py b/langfuse/types.py index 8a186a345..32ebb32d4 100644 --- a/langfuse/types.py +++ b/langfuse/types.py @@ -39,19 +39,6 @@ def my_evaluator(*, output: str, **kwargs) -> Evaluation: from langfuse.api import MediaContentType, UsageDetails from langfuse.model import MapValue, ModelUsage, PromptClient -# Experiment types -from ._client.experiments import ( - LocalExperimentItem, - ExperimentItem, - ExperimentData, - Evaluation, - ExperimentItemResult, - ExperimentResult, - TaskFunction, - EvaluatorFunction, - RunEvaluatorFunction, -) - SpanLevel = Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"] ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"] @@ -116,19 +103,7 @@ class TraceContext(TypedDict): parent_span_id: NotRequired[str] -# Export experiment types for easy access __all__ = [ - # Experiment types - "LocalExperimentItem", - "ExperimentItem", - "ExperimentData", - "Evaluation", - "ExperimentItemResult", - "ExperimentResult", - "TaskFunction", - "EvaluatorFunction", - "RunEvaluatorFunction", - # Core types (keeping existing functionality) "SpanLevel", "ScoreDataType", 
"TraceMetadata", diff --git a/poetry.lock b/poetry.lock index 3380643bd..2cdb8e476 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "annotated-types" @@ -6,7 +6,6 @@ version = "0.7.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, @@ -18,7 +17,6 @@ version = "4.10.0" description = "High-level concurrency and networking framework on top of asyncio or Trio" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "anyio-4.10.0-py3-none-any.whl", hash = "sha256:60e474ac86736bbfd6f210f7a61218939c318f43f9972497381f1c5e930ed3d1"}, {file = "anyio-4.10.0.tar.gz", hash = "sha256:3f3fae35c96039744587aa5b8371e7e8e603c0702999535961dd336026973ba6"}, @@ -39,20 +37,59 @@ version = "4.0.3" description = "Timeout context manager for asyncio programs" optional = true python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"langchain\" and python_version < \"3.11\"" files = [ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, ] +[[package]] +name = "attrs" +version = "25.3.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.8" +files = [ + {file = "attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3"}, + {file = "attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b"}, +] + +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] + +[[package]] +name = "autoevals" +version = "0.0.130" +description = "Universal library for evaluating AI models" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "autoevals-0.0.130-py3-none-any.whl", hash = "sha256:ffb7b3a21070d2a4e593bb118180c04e43531e608bffd854624377bd857ceec0"}, + {file = "autoevals-0.0.130.tar.gz", hash = "sha256:92f87ab95a575b56d9d7377e6f1399932d09180d2f3a8266b4f693f46f49b86d"}, +] + +[package.dependencies] +chevron = "*" +jsonschema = "*" +polyleven = "*" +pyyaml = "*" + +[package.extras] +all = ["IPython", "black (==22.6.0)", "braintrust", "build", "flake8", 
"flake8-isort", "isort (==5.12.0)", "numpy", "openai", "pre-commit", "pydoc-markdown", "pytest", "respx", "scipy", "twine"] +dev = ["IPython", "black (==22.6.0)", "braintrust", "build", "flake8", "flake8-isort", "isort (==5.12.0)", "openai", "pre-commit", "pytest", "respx", "twine"] +doc = ["pydoc-markdown"] +scipy = ["numpy", "scipy"] + [[package]] name = "backoff" version = "2.2.1" description = "Function decoration for backoff and retry" optional = false python-versions = ">=3.7,<4.0" -groups = ["main"] files = [ {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, @@ -64,8 +101,6 @@ version = "1.2.0" description = "Backport of asyncio.Runner, a context manager that controls event loop life cycle." optional = false python-versions = "<3.11,>=3.8" -groups = ["dev"] -markers = "python_version < \"3.11\"" files = [ {file = "backports_asyncio_runner-1.2.0-py3-none-any.whl", hash = "sha256:0da0a936a8aeb554eccb426dc55af3ba63bcdc69fa1a600b5bb305413a4477b5"}, {file = "backports_asyncio_runner-1.2.0.tar.gz", hash = "sha256:a5aa7b2b7d8f8bfcaa2b57313f70792df84e32a2a746f585213373f900b42162"}, @@ -77,7 +112,6 @@ version = "2025.8.3" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.7" -groups = ["main", "dev"] files = [ {file = "certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5"}, {file = "certifi-2025.8.3.tar.gz", hash = "sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407"}, @@ -89,7 +123,6 @@ version = "3.4.0" description = "Validate configuration and produce human readable error messages." optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, @@ -101,7 +134,6 @@ version = "3.4.3" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.7" -groups = ["main", "dev"] files = [ {file = "charset_normalizer-3.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:fb7f67a1bfa6e40b438170ebdc8158b78dc465a5a67b6dde178a46987b244a72"}, {file = "charset_normalizer-3.4.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cc9370a2da1ac13f0153780040f465839e6cccb4a1e44810124b4e22483c93fe"}, @@ -184,18 +216,27 @@ files = [ {file = "charset_normalizer-3.4.3.tar.gz", hash = "sha256:6fce4b8500244f6fcb71465d4a4930d132ba9ab8e71a7859e6a5d59851068d14"}, ] +[[package]] +name = "chevron" +version = "0.14.0" +description = "Mustache templating language renderer" +optional = false +python-versions = "*" +files = [ + {file = "chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443"}, + {file = "chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf"}, +] + [[package]] name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["main", "dev"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] -markers = {main = "extra == \"openai\" and platform_system == \"Windows\"", dev = "platform_system == \"Windows\" or sys_platform == \"win32\""} [[package]] name = "distlib" @@ -203,7 +244,6 @@ version = "0.4.0" description = "Distribution utilities" optional = false python-versions = "*" -groups = ["dev"] files = [ {file = "distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16"}, {file = "distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d"}, @@ -215,12 +255,10 @@ version = "1.9.0" description = "Distro - an OS platform information API" optional = false python-versions = ">=3.6" -groups = ["main", "dev"] files = [ {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, ] -markers = {main = "extra == \"openai\""} [[package]] name = "exceptiongroup" @@ -228,8 +266,6 @@ version = "1.3.0" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] -markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10"}, {file = "exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88"}, @@ -247,7 +283,6 @@ version = "2.1.1" description = "execnet: rapid multi-Python deployment" optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "execnet-2.1.1-py3-none-any.whl", hash = "sha256:26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc"}, {file = "execnet-2.1.1.tar.gz", hash = "sha256:5189b52c6121c24feae288166ab41b32549c7e2348652736540b9e6e7d4e72e3"}, @@ -262,7 +297,6 @@ version = "3.19.1" description = "A platform independent file lock." 
optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d"}, {file = "filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58"}, @@ -274,7 +308,6 @@ version = "1.70.0" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" -groups = ["main"] files = [ {file = "googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8"}, {file = "googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257"}, @@ -292,8 +325,6 @@ version = "3.2.4" description = "Lightweight in-process concurrent programming" optional = true python-versions = ">=3.9" -groups = ["main"] -markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") and extra == \"langchain\"" files = [ {file = "greenlet-3.2.4-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8c68325b0d0acf8d91dde4e6f930967dd52a5302cd4062932a6b2e7c2969f47c"}, {file = "greenlet-3.2.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:94385f101946790ae13da500603491f04a76b6e4c059dab271b3ce2e283b2590"}, @@ -361,7 +392,6 @@ version = "0.16.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86"}, {file = "h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1"}, @@ -373,7 +403,6 @@ version = "1.0.9" description = "A minimal low-level HTTP client." optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55"}, {file = "httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8"}, @@ -395,7 +424,6 @@ version = "0.28.1" description = "The next generation HTTP client." 
optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad"}, {file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"}, @@ -408,7 +436,7 @@ httpcore = "==1.*" idna = "*" [package.extras] -brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] +brotli = ["brotli", "brotlicffi"] cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] @@ -420,7 +448,6 @@ version = "2.6.13" description = "File identification library for Python" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "identify-2.6.13-py2.py3-none-any.whl", hash = "sha256:60381139b3ae39447482ecc406944190f690d4a2997f2584062089848361b33b"}, {file = "identify-2.6.13.tar.gz", hash = "sha256:da8d6c828e773620e13bfa86ea601c5a5310ba4bcd65edf378198b56a1f9fb32"}, @@ -435,7 +462,6 @@ version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" -groups = ["main", "dev"] files = [ {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, @@ -450,7 +476,6 @@ version = "8.7.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd"}, {file = "importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000"}, @@ -460,12 +485,12 @@ files = [ zipp = ">=3.20" [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] perf = ["ipython"] -test = ["flufl.flake8", "importlib_resources (>=1.3) ; python_version < \"3.9\"", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +test = ["flufl.flake8", "importlib_resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] type = ["pytest-mypy"] [[package]] @@ -474,7 +499,6 @@ version = "2.1.0" description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, @@ -486,7 +510,6 @@ version = "3.1.6" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" -groups = ["docs"] files = [ {file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"}, {file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"}, @@ -504,7 +527,6 @@ version = "0.10.0" description = "Fast iterable JSON parser." optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "jiter-0.10.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:cd2fb72b02478f06a900a5782de2ef47e0396b3e1f7d5aba30daeb1fce66f303"}, {file = "jiter-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:32bb468e3af278f095d3fa5b90314728a6916d89ba3d0ffb726dd9bf7367285e"}, @@ -584,7 +606,6 @@ files = [ {file = "jiter-0.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:1b28302349dc65703a9e4ead16f163b1c339efffbe1049c30a44b001a2a4fff9"}, {file = "jiter-0.10.0.tar.gz", hash = "sha256:07a7142c38aacc85194391108dc91b5b57093c978a9932bd86a36862759d9500"}, ] -markers = {main = "extra == \"openai\""} [[package]] name = "jsonpatch" @@ -592,12 +613,10 @@ version = "1.33" description = "Apply JSON-Patches (RFC 6902)" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" -groups = ["main", "dev"] files = [ {file = "jsonpatch-1.33-py2.py3-none-any.whl", hash = "sha256:0ae28c0cd062bbd8b8ecc26d7d164fbbea9652a1a3693f3b956c1eae5145dade"}, {file = "jsonpatch-1.33.tar.gz", hash = "sha256:9fcd4009c41e6d12348b4a0ff2563ba56a2923a7dfee731d004e212e1ee5030c"}, ] -markers = {main = "extra == \"langchain\""} [package.dependencies] jsonpointer = ">=1.9" @@ -608,12 +627,45 @@ version = "3.0.0" description = "Identify specific nodes in a JSON document (RFC 6901)" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] files = [ {file = "jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942"}, {file = "jsonpointer-3.0.0.tar.gz", hash = "sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef"}, ] -markers = {main = "extra == \"langchain\""} + +[[package]] +name = "jsonschema" +version = "4.25.1" +description = "An implementation of JSON Schema validation for Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "jsonschema-4.25.1-py3-none-any.whl", hash = "sha256:3fba0169e345c7175110351d456342c364814cfcf3b964ba4587f22915230a63"}, + {file = "jsonschema-4.25.1.tar.gz", hash = "sha256:e4a9655ce0da0c0b67a085847e00a3a51449e1157f4f75e9fb5aa545e122eb85"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +jsonschema-specifications = ">=2023.03.6" +referencing = ">=0.28.4" +rpds-py = ">=0.7.1" + +[package.extras] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "rfc3987-syntax (>=1.1.0)", "uri-template", "webcolors (>=24.6.0)"] + +[[package]] +name = "jsonschema-specifications" +version = "2025.9.1" +description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" +optional = false +python-versions = ">=3.9" +files = [ + {file = "jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe"}, + {file = "jsonschema_specifications-2025.9.1.tar.gz", hash = 
"sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d"}, +] + +[package.dependencies] +referencing = ">=0.31.0" [[package]] name = "langchain" @@ -621,8 +673,6 @@ version = "0.3.27" description = "Building applications with LLMs through composability" optional = true python-versions = "<4.0,>=3.9" -groups = ["main"] -markers = "extra == \"langchain\"" files = [ {file = "langchain-0.3.27-py3-none-any.whl", hash = "sha256:7b20c4f338826acb148d885b20a73a16e410ede9ee4f19bb02011852d5f98798"}, {file = "langchain-0.3.27.tar.gz", hash = "sha256:aa6f1e6274ff055d0fd36254176770f356ed0a8994297d1df47df341953cec62"}, @@ -663,12 +713,10 @@ version = "0.3.75" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "langchain_core-0.3.75-py3-none-any.whl", hash = "sha256:03ca1fadf955ee3c7d5806a841f4b3a37b816acea5e61a7e6ba1298c05eea7f5"}, {file = "langchain_core-0.3.75.tar.gz", hash = "sha256:ab0eb95a06ed6043f76162e6086b45037690cb70b7f090bd83b5ebb8a05b70ed"}, ] -markers = {main = "extra == \"langchain\""} [package.dependencies] jsonpatch = ">=1.33,<2.0" @@ -685,7 +733,6 @@ version = "0.3.32" description = "An integration package connecting OpenAI and LangChain" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "langchain_openai-0.3.32-py3-none-any.whl", hash = "sha256:3354f76822f7cc76d8069831fe2a77f9bc7ff3b4f13af788bd94e4c6e853b400"}, {file = "langchain_openai-0.3.32.tar.gz", hash = "sha256:782ad669bd1bdb964456d8882c5178717adcfceecb482cc20005f770e43d346d"}, @@ -702,8 +749,6 @@ version = "0.3.9" description = "LangChain text splitting utilities" optional = true python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"langchain\"" files = [ {file = "langchain_text_splitters-0.3.9-py3-none-any.whl", hash = "sha256:cee0bb816211584ea79cc79927317c358543f40404bcfdd69e69ba3ccde54401"}, {file = "langchain_text_splitters-0.3.9.tar.gz", hash = "sha256:7cd1e5a3aaf609979583eeca2eb34177622570b8fa8f586a605c6b1c34e7ebdb"}, @@ -718,7 +763,6 @@ version = "0.6.6" description = "Building stateful, multi-actor applications with LLMs" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "langgraph-0.6.6-py3-none-any.whl", hash = "sha256:a2283a5236abba6c8307c1a485c04e8a0f0ffa2be770878782a7bf2deb8d7954"}, {file = "langgraph-0.6.6.tar.gz", hash = "sha256:e7d3cefacf356f8c01721b166b67b3bf581659d5361a3530f59ecd9b8448eca7"}, @@ -738,7 +782,6 @@ version = "2.1.1" description = "Library with base interfaces for LangGraph checkpoint savers." optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "langgraph_checkpoint-2.1.1-py3-none-any.whl", hash = "sha256:5a779134fd28134a9a83d078be4450bbf0e0c79fdf5e992549658899e6fc5ea7"}, {file = "langgraph_checkpoint-2.1.1.tar.gz", hash = "sha256:72038c0f9e22260cb9bff1f3ebe5eb06d940b7ee5c1e4765019269d4f21cf92d"}, @@ -754,7 +797,6 @@ version = "0.6.4" description = "Library with high-level APIs for creating and executing LangGraph agents and tools." 
optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "langgraph_prebuilt-0.6.4-py3-none-any.whl", hash = "sha256:819f31d88b84cb2729ff1b79db2d51e9506b8fb7aaacfc0d359d4fe16e717344"}, {file = "langgraph_prebuilt-0.6.4.tar.gz", hash = "sha256:e9e53b906ee5df46541d1dc5303239e815d3ec551e52bb03dd6463acc79ec28f"}, @@ -770,7 +812,6 @@ version = "0.2.3" description = "SDK for interacting with LangGraph API" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "langgraph_sdk-0.2.3-py3-none-any.whl", hash = "sha256:059edfe2f62708c2e54239e170f5a33f796d456dbdbde64276c16cac8b97ba99"}, {file = "langgraph_sdk-0.2.3.tar.gz", hash = "sha256:17398aeae0f937cae1c8eb9027ada2969abdb50fe8ed3246c78f543b679cf959"}, @@ -786,12 +827,10 @@ version = "0.4.19" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "langsmith-0.4.19-py3-none-any.whl", hash = "sha256:4c50ae47e9f8430a06adb54bceaf32808f5e54fcb8186731bf7b2dab3fc30621"}, {file = "langsmith-0.4.19.tar.gz", hash = "sha256:71916bef574f72c40887ce371a4502d80c80efc2a053df123f1347e79ea83dca"}, ] -markers = {main = "extra == \"langchain\""} [package.dependencies] httpx = ">=0.23.0,<1" @@ -815,7 +854,6 @@ version = "3.0.2" description = "Safely add untrusted strings to HTML/XML markup." optional = false python-versions = ">=3.9" -groups = ["dev", "docs"] files = [ {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, @@ -886,7 +924,6 @@ version = "1.17.1" description = "Optional static typing for Python" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "mypy-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3fbe6d5555bf608c47203baa3e72dbc6ec9965b3d7c318aa9a4ca76f465bd972"}, {file = "mypy-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80ef5c058b7bce08c83cac668158cb7edea692e458d21098c7d3bce35a5d43e7"}, @@ -947,7 +984,6 @@ version = "1.1.0" description = "Type system extensions for programs checked with the mypy type checker." 
optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505"}, {file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"}, @@ -959,7 +995,6 @@ version = "1.9.1" description = "Node.js virtual environment builder" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["dev"] files = [ {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, @@ -971,12 +1006,10 @@ version = "1.102.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "openai-1.102.0-py3-none-any.whl", hash = "sha256:d751a7e95e222b5325306362ad02a7aa96e1fab3ed05b5888ce1c7ca63451345"}, {file = "openai-1.102.0.tar.gz", hash = "sha256:2e0153bcd64a6523071e90211cbfca1f2bbc5ceedd0993ba932a5869f93b7fc9"}, ] -markers = {main = "extra == \"openai\""} [package.dependencies] anyio = ">=3.5.0,<5" @@ -1000,7 +1033,6 @@ version = "1.36.0" description = "OpenTelemetry Python API" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_api-1.36.0-py3-none-any.whl", hash = "sha256:02f20bcacf666e1333b6b1f04e647dc1d5111f86b8e510238fcc56d7762cda8c"}, {file = "opentelemetry_api-1.36.0.tar.gz", hash = "sha256:9a72572b9c416d004d492cbc6e61962c0501eaf945ece9b5a0f56597d8348aa0"}, @@ -1016,7 +1048,6 @@ version = "1.36.0" description = "OpenTelemetry Protobuf encoding" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_exporter_otlp_proto_common-1.36.0-py3-none-any.whl", hash = "sha256:0fc002a6ed63eac235ada9aa7056e5492e9a71728214a61745f6ad04b923f840"}, {file = "opentelemetry_exporter_otlp_proto_common-1.36.0.tar.gz", hash = "sha256:6c496ccbcbe26b04653cecadd92f73659b814c6e3579af157d8716e5f9f25cbf"}, @@ -1031,7 +1062,6 @@ version = "1.36.0" description = "OpenTelemetry Collector Protobuf over HTTP Exporter" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_exporter_otlp_proto_http-1.36.0-py3-none-any.whl", hash = "sha256:3d769f68e2267e7abe4527f70deb6f598f40be3ea34c6adc35789bea94a32902"}, {file = "opentelemetry_exporter_otlp_proto_http-1.36.0.tar.gz", hash = "sha256:dd3637f72f774b9fc9608ab1ac479f8b44d09b6fb5b2f3df68a24ad1da7d356e"}, @@ -1052,7 +1082,6 @@ version = "1.36.0" description = "OpenTelemetry Python Proto" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_proto-1.36.0-py3-none-any.whl", hash = "sha256:151b3bf73a09f94afc658497cf77d45a565606f62ce0c17acb08cd9937ca206e"}, {file = "opentelemetry_proto-1.36.0.tar.gz", hash = "sha256:0f10b3c72f74c91e0764a5ec88fd8f1c368ea5d9c64639fb455e2854ef87dd2f"}, @@ -1067,7 +1096,6 @@ version = "1.36.0" description = "OpenTelemetry Python SDK" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_sdk-1.36.0-py3-none-any.whl", hash = "sha256:19fe048b42e98c5c1ffe85b569b7073576ad4ce0bcb6e9b4c6a39e890a6c45fb"}, {file = "opentelemetry_sdk-1.36.0.tar.gz", hash = "sha256:19c8c81599f51b71670661ff7495c905d8fdf6976e41622d5245b791b06fa581"}, @@ -1084,7 +1112,6 @@ version = "0.57b0" 
description = "OpenTelemetry Semantic Conventions" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl", hash = "sha256:757f7e76293294f124c827e514c2a3144f191ef175b069ce8d1211e1e38e9e78"}, {file = "opentelemetry_semantic_conventions-0.57b0.tar.gz", hash = "sha256:609a4a79c7891b4620d64c7aac6898f872d790d75f22019913a660756f27ff32"}, @@ -1100,7 +1127,6 @@ version = "3.11.3" description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "orjson-3.11.3-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:29cb1f1b008d936803e2da3d7cba726fc47232c45df531b29edf0b232dd737e7"}, {file = "orjson-3.11.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97dceed87ed9139884a55db8722428e27bd8452817fbf1869c58b49fecab1120"}, @@ -1186,7 +1212,6 @@ files = [ {file = "orjson-3.11.3-cp39-cp39-win_amd64.whl", hash = "sha256:215c595c792a87d4407cb72dd5e0f6ee8e694ceeb7f9102b533c5a9bf2a916bb"}, {file = "orjson-3.11.3.tar.gz", hash = "sha256:1c0603b1d2ffcd43a411d64797a19556ef76958aef1c182f22dc30860152a98a"}, ] -markers = {main = "extra == \"langchain\" and platform_python_implementation != \"PyPy\""} [[package]] name = "ormsgpack" @@ -1194,7 +1219,6 @@ version = "1.10.0" description = "Fast, correct Python msgpack library supporting dataclasses, datetimes, and numpy" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "ormsgpack-1.10.0-cp310-cp310-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:8a52c7ce7659459f3dc8dec9fd6a6c76f855a0a7e2b61f26090982ac10b95216"}, {file = "ormsgpack-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:060f67fe927582f4f63a1260726d019204b72f460cf20930e6c925a1d129f373"}, @@ -1245,7 +1269,6 @@ version = "25.0" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"}, {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"}, @@ -1257,7 +1280,6 @@ version = "0.12.1" description = "Utility library for gitignore style pattern matching of file paths." optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, @@ -1269,7 +1291,6 @@ version = "15.0.4" description = "API Documentation for Python Projects" optional = false python-versions = ">=3.9" -groups = ["docs"] files = [ {file = "pdoc-15.0.4-py3-none-any.whl", hash = "sha256:f9028e85e7bb8475b054e69bde1f6d26fc4693d25d9fa1b1ce9009bec7f7a5c4"}, {file = "pdoc-15.0.4.tar.gz", hash = "sha256:cf9680f10f5b4863381f44ef084b1903f8f356acb0d4cc6b64576ba9fb712c82"}, @@ -1286,7 +1307,6 @@ version = "4.4.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "platformdirs-4.4.0-py3-none-any.whl", hash = "sha256:abd01743f24e5287cd7a5db3752faf1a2d65353f38ec26d98e25a6db65958c85"}, {file = "platformdirs-4.4.0.tar.gz", hash = "sha256:ca753cf4d81dc309bc67b0ea38fd15dc97bc30ce419a7f58d13eb3bf14c4febf"}, @@ -1303,7 +1323,6 @@ version = "1.6.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, @@ -1313,13 +1332,76 @@ files = [ dev = ["pre-commit", "tox"] testing = ["coverage", "pytest", "pytest-benchmark"] +[[package]] +name = "polyleven" +version = "0.9.0" +description = "A fast C-implemented library for Levenshtein distance" +optional = false +python-versions = ">=3.8" +files = [ + {file = "polyleven-0.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6e00207fbe0fcdde206b9b277cf14bb9db8801f8d303204b1572870797399974"}, + {file = "polyleven-0.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d400f255af038f77b37d5010532e0e82d07160457c8282e5b40632987ab815be"}, + {file = "polyleven-0.9.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a1d3f1b385e9f51090beca54925a0fd0ab2d744fcea91dd9353c7b13bbb274f"}, + {file = "polyleven-0.9.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:2be92bb7743e3b3e14a2b894902f4ceeea5700849dd9e9ab59c68bd7943b3d85"}, + {file = "polyleven-0.9.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7bd784bad5164d0d4e823d98aa8ffdc118c14d211dfd7271ede7f1baa7efc691"}, + {file = "polyleven-0.9.0-cp310-cp310-win32.whl", hash = "sha256:bac610f5a30b56ab2fbb1a3de071ef9ed3aa6a572a80a4cfbf0665929e0f6451"}, + {file = "polyleven-0.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:4e4ab3cfc196907751adb3b65959ad8be08fc06679d071fdf01e5225f394812e"}, + {file = "polyleven-0.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e58bbcd3f062043fa67e76e89f803eb308ea06fbb4dc6f32d7063c37f1c16dfd"}, + {file = "polyleven-0.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3fd803de02e99f51ade3fcae4e5be50c89c1ff360213bcdbcf98820e2633c71a"}, + {file = "polyleven-0.9.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff60e2da0864b3d4bec2826eadbbb0a8967384d53bec9e693aad7b0089e1258c"}, + {file = "polyleven-0.9.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:259856641423ca82230237d637869301ba02971c24283101b67c8117e7116b7a"}, + {file = "polyleven-0.9.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a46e7b364b3936f025022d1182e10cba9ac45974dc2cafa17b7f9f515784adb5"}, + {file = "polyleven-0.9.0-cp311-cp311-win32.whl", hash = "sha256:6f0fd999efaa0d5409603ae7e44b60152b8d12a190b54115bcf0ba93e41e09f1"}, + {file = "polyleven-0.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:65a6e899db184bce6384526e46f446c6c159a2b0bb3b463dcc78a2bc8ddf85f5"}, + {file = "polyleven-0.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1b9c905fa0862c1f3e27e948a713fb86a26ce1659f1d90b1b4aff04a8890213b"}, + {file = "polyleven-0.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:7058bea0da4893ebb8bedd9f638ec4e026c150e29b7b7385db5c157742d0ff11"}, + {file = "polyleven-0.9.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b99fcfc48c1eaacc4a46dd9d22dc98de111120c66b56df14257f276b762bd591"}, + {file = "polyleven-0.9.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:29ef7db85a7bb01be9372461bc8d8993d4817dfcea702e4d2b8f0d9c43415ebe"}, + {file = "polyleven-0.9.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:288bfe0a0040421c52a5dc312b55c47812a72fb9cd7e6d19859ac2f9f11f350f"}, + {file = "polyleven-0.9.0-cp312-cp312-win32.whl", hash = "sha256:7260fa32fff7194e06b4221e0a6d2ba2decd4e4dc51f7f8cddbf365649326ee4"}, + {file = "polyleven-0.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:4db8b16aac237dbf644a0e4323c3ba0907dab6adecd2a345bf2fa92301d7fb2d"}, + {file = "polyleven-0.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45cea2885c61bda9711244a51aed068f9a55f1d776d4caad6c574a3f401945ae"}, + {file = "polyleven-0.9.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62b039e9dc8fa53ad740de02d168a7e9d0edce3734b2927f40fe851b328b766f"}, + {file = "polyleven-0.9.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0a0c1ecd2dc356fd94edc80e18a30ad28e93ccc840127e765b83ad60426b2d5"}, + {file = "polyleven-0.9.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:20576da0c8000bd1c4a07cee43db9169b7d094f5dcc03b20775506d07c56f4fb"}, + {file = "polyleven-0.9.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ba356ce9e7e7e8ddf4eff17eb39df5b822cb8899450c6d289a22249b78c9a5f4"}, + {file = "polyleven-0.9.0-cp313-cp313-win32.whl", hash = "sha256:244d759986486252121061d727a642d3505cbdd9e6616467b42935e662a9fa61"}, + {file = "polyleven-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f671df664924b3ec14195be7bf778d5f71811989e59a3f9547f8066cefc596f"}, + {file = "polyleven-0.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7309296f1f91e7aa7d292e5b9aa0da53f2ce7997cfda8535155424a791fe73c8"}, + {file = "polyleven-0.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50c71e238153acdf010c7fe6f18835dd6d7ca37a7e7cca08d51c2234e2227019"}, + {file = "polyleven-0.9.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ecf0a858b7694acea0f7459f8699f8b1f62ee99d88529b01f3a1597aa4c53978"}, + {file = "polyleven-0.9.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:c903c9b70a089c5f2b5990ce3a09ac1ce39d0b1ea93ec8c9e1eb217ddea779c6"}, + {file = "polyleven-0.9.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:e9608f5835f8fb3778aaad2b126aaea201cd9a6b210286533762c29cd3debcf2"}, + {file = "polyleven-0.9.0-cp38-cp38-win32.whl", hash = "sha256:aabd963fef557f6afe4306920cbd6c580aff572c8a96c5d6bf572fb9c4bdce46"}, + {file = "polyleven-0.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:e8c4c3c6515f4753fe69becb4686009bc5a5776752fd27a3d34d89f54f8c40e6"}, + {file = "polyleven-0.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c672c982108a48c7aebd7016aa8482b8ee96f01280a68cbee56293055aebdfc7"}, + {file = "polyleven-0.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a4f857c9f7fd99b7e41305e6cdb30d39592b1a6ca50fbc20edd175746e376ca"}, + {file = 
"polyleven-0.9.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26e06e1da0734c8d5a1625589d2bd213f9d40d0023370475c167dc773239ab78"}, + {file = "polyleven-0.9.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:9859199fefc85329b495cd0ce5b34df1a9acf6623d3dbaff5fcb688ade59fb88"}, + {file = "polyleven-0.9.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:58703ae7483b46a5e05d2d3f2cac2e345b96b57faaebfe09c5890eb5346daf31"}, + {file = "polyleven-0.9.0-cp39-cp39-win32.whl", hash = "sha256:92a0d2e4d6230f2ccc14d12d11cb496d5d5b81d975841bfed9dce6d11cf90826"}, + {file = "polyleven-0.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:1d651a6714caf4d144f8cb0bd6b1eb043a2ca80dd7c6d87b8f8020edc1729149"}, + {file = "polyleven-0.9.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:0a59f3cf5297e22aac73cf439e1e9cb0703af1adc853fb911637172db09bddec"}, + {file = "polyleven-0.9.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3c8581d8eae56d0e0e3cce33384b4365ef29a924f48edc6b3b5a694412c4b7d"}, + {file = "polyleven-0.9.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:603f0ea18dc0826f7078c14484c227dcdb61ca8e4485d0b67f2df317a3a01726"}, + {file = "polyleven-0.9.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8cf8ff07ea44947e9a34ab371a3b0fec4d2328957332185445cfdd1675539cb9"}, + {file = "polyleven-0.9.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:cf4fb8f5be74b9bf7e6f7c2014ee153dc4208af337b781cf3aafc5f51a647d80"}, + {file = "polyleven-0.9.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f21e6c050f6f0d259cf9c6367042ba6a69e553b8294143c83bb47f6481486f9c"}, + {file = "polyleven-0.9.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c74d8cba499541fe96e96a76cb8ac2bac7f3d7efeb8c2cec1bf1383c91790f4"}, + {file = "polyleven-0.9.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5260411e820a858728d32f161690a54bc2162644dba8f4e2b0dd72707d00ac20"}, + {file = "polyleven-0.9.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:81ae9a154c82d53ff67d6cd6b4ee96de3e449f2c8cccd49aaa62b50f6e57a4eb"}, + {file = "polyleven-0.9.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef398fe2759f84a6c088320742f09ecef5904e5c1f60668eed08f431221c5239"}, + {file = "polyleven-0.9.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3163f6c7ad192ee14ef760b1dd3143a3107c483a327dcfb5e6c94d4c8217fa4"}, + {file = "polyleven-0.9.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:87ef064bfe4a1b13414e440f56a716096375ec93cf1351bed9a84942c230c715"}, + {file = "polyleven-0.9.0.tar.gz", hash = "sha256:299a93766761b5e5fb4092388f3dc6401224fd436c05f11c4ee48b262587e8da"}, +] + [[package]] name = "pre-commit" version = "3.8.0" description = "A framework for managing and maintaining multi-language pre-commit hooks." 
optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pre_commit-3.8.0-py2.py3-none-any.whl", hash = "sha256:9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f"}, {file = "pre_commit-3.8.0.tar.gz", hash = "sha256:8bb6494d4a20423842e198980c9ecf9f96607a07ea29549e180eef9ae80fe7af"}, @@ -1338,7 +1420,6 @@ version = "6.32.0" description = "" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "protobuf-6.32.0-cp310-abi3-win32.whl", hash = "sha256:84f9e3c1ff6fb0308dbacb0950d8aa90694b0d0ee68e75719cb044b7078fe741"}, {file = "protobuf-6.32.0-cp310-abi3-win_amd64.whl", hash = "sha256:a8bdbb2f009cfc22a36d031f22a625a38b615b5e19e558a7b756b3279723e68e"}, @@ -1357,7 +1438,6 @@ version = "2.11.7" description = "Data validation using Python type hints" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b"}, {file = "pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db"}, @@ -1371,7 +1451,7 @@ typing-inspection = ">=0.4.0" [package.extras] email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] +timezone = ["tzdata"] [[package]] name = "pydantic-core" @@ -1379,7 +1459,6 @@ version = "2.33.2" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8"}, {file = "pydantic_core-2.33.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d"}, @@ -1491,7 +1570,6 @@ version = "2.19.2" description = "Pygments is a syntax highlighting package written in Python." 
optional = false python-versions = ">=3.8" -groups = ["dev", "docs"] files = [ {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, @@ -1506,7 +1584,6 @@ version = "8.4.1" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7"}, {file = "pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c"}, @@ -1530,7 +1607,6 @@ version = "1.1.0" description = "Pytest support for asyncio" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pytest_asyncio-1.1.0-py3-none-any.whl", hash = "sha256:5fe2d69607b0bd75c656d1211f969cadba035030156745ee09e7d71740e58ecf"}, {file = "pytest_asyncio-1.1.0.tar.gz", hash = "sha256:796aa822981e01b68c12e4827b8697108f7205020f24b5793b3c41555dab68ea"}, @@ -1551,7 +1627,6 @@ version = "1.1.3" description = "pytest-httpserver is a httpserver for pytest" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pytest_httpserver-1.1.3-py3-none-any.whl", hash = "sha256:5f84757810233e19e2bb5287f3826a71c97a3740abe3a363af9155c0f82fdbb9"}, {file = "pytest_httpserver-1.1.3.tar.gz", hash = "sha256:af819d6b533f84b4680b9416a5b3f67f1df3701f1da54924afd4d6e4ba5917ec"}, @@ -1566,7 +1641,6 @@ version = "2.4.0" description = "pytest plugin to abort hanging tests" optional = false python-versions = ">=3.7" -groups = ["dev"] files = [ {file = "pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2"}, {file = "pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a"}, @@ -1581,7 +1655,6 @@ version = "3.8.0" description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88"}, {file = "pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1"}, @@ -1602,7 +1675,6 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -1658,7 +1730,22 @@ files = [ {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] -markers = {main = "extra == \"langchain\""} + +[[package]] +name = "referencing" +version = "0.36.2" +description = "JSON Referencing + Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0"}, + {file = "referencing-0.36.2.tar.gz", 
hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +rpds-py = ">=0.7.0" +typing-extensions = {version = ">=4.4.0", markers = "python_version < \"3.13\""} [[package]] name = "regex" @@ -1666,7 +1753,6 @@ version = "2025.7.34" description = "Alternative regular expression module, to replace re." optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "regex-2025.7.34-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d856164d25e2b3b07b779bfed813eb4b6b6ce73c2fd818d46f47c1eb5cd79bd6"}, {file = "regex-2025.7.34-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d15a9da5fad793e35fb7be74eec450d968e05d2e294f3e0e77ab03fa7234a83"}, @@ -1763,7 +1849,6 @@ version = "2.32.5" description = "Python HTTP for Humans." optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6"}, {file = "requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf"}, @@ -1785,23 +1870,184 @@ version = "1.0.0" description = "A utility belt for advanced users of python-requests" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -groups = ["main", "dev"] files = [ {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, ] -markers = {main = "extra == \"langchain\""} [package.dependencies] requests = ">=2.0.1,<3.0.0" +[[package]] +name = "rpds-py" +version = "0.27.1" +description = "Python bindings to Rust's persistent data structures (rpds)" +optional = false +python-versions = ">=3.9" +files = [ + {file = "rpds_py-0.27.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:68afeec26d42ab3b47e541b272166a0b4400313946871cba3ed3a4fc0cab1cef"}, + {file = "rpds_py-0.27.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74e5b2f7bb6fa38b1b10546d27acbacf2a022a8b5543efb06cfebc72a59c85be"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9024de74731df54546fab0bfbcdb49fae19159ecaecfc8f37c18d2c7e2c0bd61"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:31d3ebadefcd73b73928ed0b2fd696f7fefda8629229f81929ac9c1854d0cffb"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2e7f8f169d775dd9092a1743768d771f1d1300453ddfe6325ae3ab5332b4657"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d905d16f77eb6ab2e324e09bfa277b4c8e5e6b8a78a3e7ff8f3cdf773b4c013"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50c946f048209e6362e22576baea09193809f87687a95a8db24e5fbdb307b93a"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:3deab27804d65cd8289eb814c2c0e807c4b9d9916c9225e363cb0cf875eb67c1"}, + {file = "rpds_py-0.27.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8b61097f7488de4be8244c89915da8ed212832ccf1e7c7753a25a394bf9b1f10"}, + {file = "rpds_py-0.27.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8a3f29aba6e2d7d90528d3c792555a93497fe6538aa65eb675b44505be747808"}, + {file = 
"rpds_py-0.27.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dd6cd0485b7d347304067153a6dc1d73f7d4fd995a396ef32a24d24b8ac63ac8"}, + {file = "rpds_py-0.27.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6f4461bf931108c9fa226ffb0e257c1b18dc2d44cd72b125bec50ee0ab1248a9"}, + {file = "rpds_py-0.27.1-cp310-cp310-win32.whl", hash = "sha256:ee5422d7fb21f6a00c1901bf6559c49fee13a5159d0288320737bbf6585bd3e4"}, + {file = "rpds_py-0.27.1-cp310-cp310-win_amd64.whl", hash = "sha256:3e039aabf6d5f83c745d5f9a0a381d031e9ed871967c0a5c38d201aca41f3ba1"}, + {file = "rpds_py-0.27.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:be898f271f851f68b318872ce6ebebbc62f303b654e43bf72683dbdc25b7c881"}, + {file = "rpds_py-0.27.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:62ac3d4e3e07b58ee0ddecd71d6ce3b1637de2d373501412df395a0ec5f9beb5"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4708c5c0ceb2d034f9991623631d3d23cb16e65c83736ea020cdbe28d57c0a0e"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:abfa1171a9952d2e0002aba2ad3780820b00cc3d9c98c6630f2e93271501f66c"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b507d19f817ebaca79574b16eb2ae412e5c0835542c93fe9983f1e432aca195"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:168b025f8fd8d8d10957405f3fdcef3dc20f5982d398f90851f4abc58c566c52"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb56c6210ef77caa58e16e8c17d35c63fe3f5b60fd9ba9d424470c3400bcf9ed"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:d252f2d8ca0195faa707f8eb9368955760880b2b42a8ee16d382bf5dd807f89a"}, + {file = "rpds_py-0.27.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6e5e54da1e74b91dbc7996b56640f79b195d5925c2b78efaa8c5d53e1d88edde"}, + {file = "rpds_py-0.27.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ffce0481cc6e95e5b3f0a47ee17ffbd234399e6d532f394c8dce320c3b089c21"}, + {file = "rpds_py-0.27.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a205fdfe55c90c2cd8e540ca9ceba65cbe6629b443bc05db1f590a3db8189ff9"}, + {file = "rpds_py-0.27.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:689fb5200a749db0415b092972e8eba85847c23885c8543a8b0f5c009b1a5948"}, + {file = "rpds_py-0.27.1-cp311-cp311-win32.whl", hash = "sha256:3182af66048c00a075010bc7f4860f33913528a4b6fc09094a6e7598e462fe39"}, + {file = "rpds_py-0.27.1-cp311-cp311-win_amd64.whl", hash = "sha256:b4938466c6b257b2f5c4ff98acd8128ec36b5059e5c8f8372d79316b1c36bb15"}, + {file = "rpds_py-0.27.1-cp311-cp311-win_arm64.whl", hash = "sha256:2f57af9b4d0793e53266ee4325535a31ba48e2f875da81a9177c9926dfa60746"}, + {file = "rpds_py-0.27.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:ae2775c1973e3c30316892737b91f9283f9908e3cc7625b9331271eaaed7dc90"}, + {file = "rpds_py-0.27.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2643400120f55c8a96f7c9d858f7be0c88d383cd4653ae2cf0d0c88f668073e5"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16323f674c089b0360674a4abd28d5042947d54ba620f72514d69be4ff64845e"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9a1f4814b65eacac94a00fc9a526e3fdafd78e439469644032032d0d63de4881"}, + {file = 
"rpds_py-0.27.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ba32c16b064267b22f1850a34051121d423b6f7338a12b9459550eb2096e7ec"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5c20f33fd10485b80f65e800bbe5f6785af510b9f4056c5a3c612ebc83ba6cb"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:466bfe65bd932da36ff279ddd92de56b042f2266d752719beb97b08526268ec5"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:41e532bbdcb57c92ba3be62c42e9f096431b4cf478da9bc3bc6ce5c38ab7ba7a"}, + {file = "rpds_py-0.27.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f149826d742b406579466283769a8ea448eed82a789af0ed17b0cd5770433444"}, + {file = "rpds_py-0.27.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:80c60cfb5310677bd67cb1e85a1e8eb52e12529545441b43e6f14d90b878775a"}, + {file = "rpds_py-0.27.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:7ee6521b9baf06085f62ba9c7a3e5becffbc32480d2f1b351559c001c38ce4c1"}, + {file = "rpds_py-0.27.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a512c8263249a9d68cac08b05dd59d2b3f2061d99b322813cbcc14c3c7421998"}, + {file = "rpds_py-0.27.1-cp312-cp312-win32.whl", hash = "sha256:819064fa048ba01b6dadc5116f3ac48610435ac9a0058bbde98e569f9e785c39"}, + {file = "rpds_py-0.27.1-cp312-cp312-win_amd64.whl", hash = "sha256:d9199717881f13c32c4046a15f024971a3b78ad4ea029e8da6b86e5aa9cf4594"}, + {file = "rpds_py-0.27.1-cp312-cp312-win_arm64.whl", hash = "sha256:33aa65b97826a0e885ef6e278fbd934e98cdcfed80b63946025f01e2f5b29502"}, + {file = "rpds_py-0.27.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e4b9fcfbc021633863a37e92571d6f91851fa656f0180246e84cbd8b3f6b329b"}, + {file = "rpds_py-0.27.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1441811a96eadca93c517d08df75de45e5ffe68aa3089924f963c782c4b898cf"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55266dafa22e672f5a4f65019015f90336ed31c6383bd53f5e7826d21a0e0b83"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d78827d7ac08627ea2c8e02c9e5b41180ea5ea1f747e9db0915e3adf36b62dcf"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae92443798a40a92dc5f0b01d8a7c93adde0c4dc965310a29ae7c64d72b9fad2"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c46c9dd2403b66a2a3b9720ec4b74d4ab49d4fabf9f03dfdce2d42af913fe8d0"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2efe4eb1d01b7f5f1939f4ef30ecea6c6b3521eec451fb93191bf84b2a522418"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:15d3b4d83582d10c601f481eca29c3f138d44c92187d197aff663a269197c02d"}, + {file = "rpds_py-0.27.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4ed2e16abbc982a169d30d1a420274a709949e2cbdef119fe2ec9d870b42f274"}, + {file = "rpds_py-0.27.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a75f305c9b013289121ec0f1181931975df78738cdf650093e6b86d74aa7d8dd"}, + {file = "rpds_py-0.27.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:67ce7620704745881a3d4b0ada80ab4d99df390838839921f99e63c474f82cf2"}, + {file = "rpds_py-0.27.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:9d992ac10eb86d9b6f369647b6a3f412fc0075cfd5d799530e84d335e440a002"}, + {file = "rpds_py-0.27.1-cp313-cp313-win32.whl", hash = "sha256:4f75e4bd8ab8db624e02c8e2fc4063021b58becdbe6df793a8111d9343aec1e3"}, + {file = "rpds_py-0.27.1-cp313-cp313-win_amd64.whl", hash = "sha256:f9025faafc62ed0b75a53e541895ca272815bec18abe2249ff6501c8f2e12b83"}, + {file = "rpds_py-0.27.1-cp313-cp313-win_arm64.whl", hash = "sha256:ed10dc32829e7d222b7d3b93136d25a406ba9788f6a7ebf6809092da1f4d279d"}, + {file = "rpds_py-0.27.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:92022bbbad0d4426e616815b16bc4127f83c9a74940e1ccf3cfe0b387aba0228"}, + {file = "rpds_py-0.27.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:47162fdab9407ec3f160805ac3e154df042e577dd53341745fc7fb3f625e6d92"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb89bec23fddc489e5d78b550a7b773557c9ab58b7946154a10a6f7a214a48b2"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e48af21883ded2b3e9eb48cb7880ad8598b31ab752ff3be6457001d78f416723"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6f5b7bd8e219ed50299e58551a410b64daafb5017d54bbe822e003856f06a802"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08f1e20bccf73b08d12d804d6e1c22ca5530e71659e6673bce31a6bb71c1e73f"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dc5dceeaefcc96dc192e3a80bbe1d6c410c469e97bdd47494a7d930987f18b2"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:d76f9cc8665acdc0c9177043746775aa7babbf479b5520b78ae4002d889f5c21"}, + {file = "rpds_py-0.27.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:134fae0e36022edad8290a6661edf40c023562964efea0cc0ec7f5d392d2aaef"}, + {file = "rpds_py-0.27.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:eb11a4f1b2b63337cfd3b4d110af778a59aae51c81d195768e353d8b52f88081"}, + {file = "rpds_py-0.27.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:13e608ac9f50a0ed4faec0e90ece76ae33b34c0e8656e3dceb9a7db994c692cd"}, + {file = "rpds_py-0.27.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dd2135527aa40f061350c3f8f89da2644de26cd73e4de458e79606384f4f68e7"}, + {file = "rpds_py-0.27.1-cp313-cp313t-win32.whl", hash = "sha256:3020724ade63fe320a972e2ffd93b5623227e684315adce194941167fee02688"}, + {file = "rpds_py-0.27.1-cp313-cp313t-win_amd64.whl", hash = "sha256:8ee50c3e41739886606388ba3ab3ee2aae9f35fb23f833091833255a31740797"}, + {file = "rpds_py-0.27.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:acb9aafccaae278f449d9c713b64a9e68662e7799dbd5859e2c6b3c67b56d334"}, + {file = "rpds_py-0.27.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:b7fb801aa7f845ddf601c49630deeeccde7ce10065561d92729bfe81bd21fb33"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe0dd05afb46597b9a2e11c351e5e4283c741237e7f617ffb3252780cca9336a"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b6dfb0e058adb12d8b1d1b25f686e94ffa65d9995a5157afe99743bf7369d62b"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ed090ccd235f6fa8bb5861684567f0a83e04f52dfc2e5c05f2e4b1309fcf85e7"}, + {file = 
"rpds_py-0.27.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bf876e79763eecf3e7356f157540d6a093cef395b65514f17a356f62af6cc136"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:12ed005216a51b1d6e2b02a7bd31885fe317e45897de81d86dcce7d74618ffff"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:ee4308f409a40e50593c7e3bb8cbe0b4d4c66d1674a316324f0c2f5383b486f9"}, + {file = "rpds_py-0.27.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0b08d152555acf1f455154d498ca855618c1378ec810646fcd7c76416ac6dc60"}, + {file = "rpds_py-0.27.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:dce51c828941973a5684d458214d3a36fcd28da3e1875d659388f4f9f12cc33e"}, + {file = "rpds_py-0.27.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:c1476d6f29eb81aa4151c9a31219b03f1f798dc43d8af1250a870735516a1212"}, + {file = "rpds_py-0.27.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:3ce0cac322b0d69b63c9cdb895ee1b65805ec9ffad37639f291dd79467bee675"}, + {file = "rpds_py-0.27.1-cp314-cp314-win32.whl", hash = "sha256:dfbfac137d2a3d0725758cd141f878bf4329ba25e34979797c89474a89a8a3a3"}, + {file = "rpds_py-0.27.1-cp314-cp314-win_amd64.whl", hash = "sha256:a6e57b0abfe7cc513450fcf529eb486b6e4d3f8aee83e92eb5f1ef848218d456"}, + {file = "rpds_py-0.27.1-cp314-cp314-win_arm64.whl", hash = "sha256:faf8d146f3d476abfee026c4ae3bdd9ca14236ae4e4c310cbd1cf75ba33d24a3"}, + {file = "rpds_py-0.27.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:ba81d2b56b6d4911ce735aad0a1d4495e808b8ee4dc58715998741a26874e7c2"}, + {file = "rpds_py-0.27.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:84f7d509870098de0e864cad0102711c1e24e9b1a50ee713b65928adb22269e4"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e960fc78fecd1100539f14132425e1d5fe44ecb9239f8f27f079962021523e"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:62f85b665cedab1a503747617393573995dac4600ff51869d69ad2f39eb5e817"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fed467af29776f6556250c9ed85ea5a4dd121ab56a5f8b206e3e7a4c551e48ec"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2729615f9d430af0ae6b36cf042cb55c0936408d543fb691e1a9e36648fd35a"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b207d881a9aef7ba753d69c123a35d96ca7cb808056998f6b9e8747321f03b8"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:639fd5efec029f99b79ae47e5d7e00ad8a773da899b6309f6786ecaf22948c48"}, + {file = "rpds_py-0.27.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fecc80cb2a90e28af8a9b366edacf33d7a91cbfe4c2c4544ea1246e949cfebeb"}, + {file = "rpds_py-0.27.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:42a89282d711711d0a62d6f57d81aa43a1368686c45bc1c46b7f079d55692734"}, + {file = "rpds_py-0.27.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:cf9931f14223de59551ab9d38ed18d92f14f055a5f78c1d8ad6493f735021bbb"}, + {file = "rpds_py-0.27.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f39f58a27cc6e59f432b568ed8429c7e1641324fbe38131de852cd77b2d534b0"}, + {file = "rpds_py-0.27.1-cp314-cp314t-win32.whl", hash = "sha256:d5fa0ee122dc09e23607a28e6d7b150da16c662e66409bbe85230e4c85bb528a"}, + {file = 
"rpds_py-0.27.1-cp314-cp314t-win_amd64.whl", hash = "sha256:6567d2bb951e21232c2f660c24cf3470bb96de56cdcb3f071a83feeaff8a2772"}, + {file = "rpds_py-0.27.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c918c65ec2e42c2a78d19f18c553d77319119bf43aa9e2edf7fb78d624355527"}, + {file = "rpds_py-0.27.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1fea2b1a922c47c51fd07d656324531adc787e415c8b116530a1d29c0516c62d"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbf94c58e8e0cd6b6f38d8de67acae41b3a515c26169366ab58bdca4a6883bb8"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c2a8fed130ce946d5c585eddc7c8eeef0051f58ac80a8ee43bd17835c144c2cc"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:037a2361db72ee98d829bc2c5b7cc55598ae0a5e0ec1823a56ea99374cfd73c1"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5281ed1cc1d49882f9997981c88df1a22e140ab41df19071222f7e5fc4e72125"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fd50659a069c15eef8aa3d64bbef0d69fd27bb4a50c9ab4f17f83a16cbf8905"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_31_riscv64.whl", hash = "sha256:c4b676c4ae3921649a15d28ed10025548e9b561ded473aa413af749503c6737e"}, + {file = "rpds_py-0.27.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:079bc583a26db831a985c5257797b2b5d3affb0386e7ff886256762f82113b5e"}, + {file = "rpds_py-0.27.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4e44099bd522cba71a2c6b97f68e19f40e7d85399de899d66cdb67b32d7cb786"}, + {file = "rpds_py-0.27.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e202e6d4188e53c6661af813b46c37ca2c45e497fc558bacc1a7630ec2695aec"}, + {file = "rpds_py-0.27.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:f41f814b8eaa48768d1bb551591f6ba45f87ac76899453e8ccd41dba1289b04b"}, + {file = "rpds_py-0.27.1-cp39-cp39-win32.whl", hash = "sha256:9e71f5a087ead99563c11fdaceee83ee982fd39cf67601f4fd66cb386336ee52"}, + {file = "rpds_py-0.27.1-cp39-cp39-win_amd64.whl", hash = "sha256:71108900c9c3c8590697244b9519017a400d9ba26a36c48381b3f64743a44aab"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7ba22cb9693df986033b91ae1d7a979bc399237d45fccf875b76f62bb9e52ddf"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5b640501be9288c77738b5492b3fd3abc4ba95c50c2e41273c8a1459f08298d3"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb08b65b93e0c6dd70aac7f7890a9c0938d5ec71d5cb32d45cf844fb8ae47636"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d7ff07d696a7a38152ebdb8212ca9e5baab56656749f3d6004b34ab726b550b8"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fb7c72262deae25366e3b6c0c0ba46007967aea15d1eea746e44ddba8ec58dcc"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b002cab05d6339716b03a4a3a2ce26737f6231d7b523f339fa061d53368c9d8"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23f6b69d1c26c4704fec01311963a41d7de3ee0570a84ebde4d544e5a1859ffc"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_31_riscv64.whl", hash = 
"sha256:530064db9146b247351f2a0250b8f00b289accea4596a033e94be2389977de71"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7b90b0496570bd6b0321724a330d8b545827c4df2034b6ddfc5f5275f55da2ad"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:879b0e14a2da6a1102a3fc8af580fc1ead37e6d6692a781bd8c83da37429b5ab"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:0d807710df3b5faa66c731afa162ea29717ab3be17bdc15f90f2d9f183da4059"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:3adc388fc3afb6540aec081fa59e6e0d3908722771aa1e37ffe22b220a436f0b"}, + {file = "rpds_py-0.27.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c796c0c1cc68cb08b0284db4229f5af76168172670c74908fdbd4b7d7f515819"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cdfe4bb2f9fe7458b7453ad3c33e726d6d1c7c0a72960bcc23800d77384e42df"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:8fabb8fd848a5f75a2324e4a84501ee3a5e3c78d8603f83475441866e60b94a3"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eda8719d598f2f7f3e0f885cba8646644b55a187762bec091fa14a2b819746a9"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c64d07e95606ec402a0a1c511fe003873fa6af630bda59bac77fac8b4318ebc"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93a2ed40de81bcff59aabebb626562d48332f3d028ca2036f1d23cbb52750be4"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:387ce8c44ae94e0ec50532d9cb0edce17311024c9794eb196b90e1058aadeb66"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aaf94f812c95b5e60ebaf8bfb1898a7d7cb9c1af5744d4a67fa47796e0465d4e"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:4848ca84d6ded9b58e474dfdbad4b8bfb450344c0551ddc8d958bf4b36aa837c"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2bde09cbcf2248b73c7c323be49b280180ff39fadcfe04e7b6f54a678d02a7cf"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:94c44ee01fd21c9058f124d2d4f0c9dc7634bec93cd4b38eefc385dabe71acbf"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:df8b74962e35c9249425d90144e721eed198e6555a0e22a563d29fe4486b51f6"}, + {file = "rpds_py-0.27.1-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:dc23e6820e3b40847e2f4a7726462ba0cf53089512abe9ee16318c366494c17a"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:aa8933159edc50be265ed22b401125c9eebff3171f570258854dbce3ecd55475"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a50431bf02583e21bf273c71b89d710e7a710ad5e39c725b14e685610555926f"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78af06ddc7fe5cc0e967085a9115accee665fb912c22a3f54bad70cc65b05fe6"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:70d0738ef8fee13c003b100c2fbd667ec4f133468109b3472d249231108283a3"}, + {file = 
"rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2f6fd8a1cea5bbe599b6e78a6e5ee08db434fc8ffea51ff201c8765679698b3"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8177002868d1426305bb5de1e138161c2ec9eb2d939be38291d7c431c4712df8"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:008b839781d6c9bf3b6a8984d1d8e56f0ec46dc56df61fd669c49b58ae800400"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:a55b9132bb1ade6c734ddd2759c8dc132aa63687d259e725221f106b83a0e485"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a46fdec0083a26415f11d5f236b79fa1291c32aaa4a17684d82f7017a1f818b1"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:8a63b640a7845f2bdd232eb0d0a4a2dd939bcdd6c57e6bb134526487f3160ec5"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:7e32721e5d4922deaaf963469d795d5bde6093207c52fec719bd22e5d1bedbc4"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:2c426b99a068601b5f4623573df7a7c3d72e87533a2dd2253353a03e7502566c"}, + {file = "rpds_py-0.27.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:4fc9b7fe29478824361ead6e14e4f5aed570d477e06088826537e202d25fe859"}, + {file = "rpds_py-0.27.1.tar.gz", hash = "sha256:26a1c73171d10b7acccbded82bf6a586ab8203601e565badc74bbbf8bc5a10f8"}, +] + [[package]] name = "ruff" version = "0.12.11" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" -groups = ["dev"] files = [ {file = "ruff-0.12.11-py3-none-linux_armv6l.whl", hash = "sha256:93fce71e1cac3a8bf9200e63a38ac5c078f3b6baebffb74ba5274fb2ab276065"}, {file = "ruff-0.12.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b8e33ac7b28c772440afa80cebb972ffd823621ded90404f29e5ab6d1e2d4b93"}, @@ -1830,7 +2076,6 @@ version = "1.3.1" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -1842,8 +2087,6 @@ version = "2.0.43" description = "Database Abstraction Library" optional = true python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"langchain\"" files = [ {file = "SQLAlchemy-2.0.43-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:21ba7a08a4253c5825d1db389d4299f64a100ef9800e4624c8bf70d8f136e6ed"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11b9503fa6f8721bef9b8567730f664c5a5153d25e247aadc69247c4bc605227"}, @@ -1939,12 +2182,10 @@ version = "9.1.2" description = "Retry code until it succeeds" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138"}, {file = "tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb"}, ] -markers = {main = "extra == \"langchain\""} [package.extras] doc = ["reno", "sphinx"] @@ -1956,7 +2197,6 @@ version = "0.11.0" description = "tiktoken is a fast BPE 
tokeniser for use with OpenAI's models" optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "tiktoken-0.11.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:8a9b517d6331d7103f8bef29ef93b3cca95fa766e293147fe7bacddf310d5917"}, {file = "tiktoken-0.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b4ddb1849e6bf0afa6cc1c5d809fb980ca240a5fffe585a04e119519758788c0"}, @@ -2004,8 +2244,6 @@ version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" -groups = ["dev"] -markers = "python_version < \"3.11\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, @@ -2047,12 +2285,10 @@ version = "4.67.1" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] files = [ {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, ] -markers = {main = "extra == \"openai\""} [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} @@ -2070,7 +2306,6 @@ version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, @@ -2082,7 +2317,6 @@ version = "0.4.1" description = "Runtime typing introspection tools" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51"}, {file = "typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28"}, @@ -2097,14 +2331,13 @@ version = "2.5.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"}, {file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"}, ] [package.extras] -brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] @@ -2115,7 +2348,6 @@ version = "20.34.0" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.8" -groups = ["dev"] files = [ {file = "virtualenv-20.34.0-py3-none-any.whl", hash = "sha256:341f5afa7eee943e4984a9207c025feedd768baff6753cd660c857ceb3e36026"}, {file = "virtualenv-20.34.0.tar.gz", hash = "sha256:44815b2c9dee7ed86e387b842a84f20b93f7f417f95886ca1996a72a4138eb1a"}, @@ -2129,7 +2361,7 @@ typing-extensions = {version = ">=4.13.2", markers = "python_version < \"3.11\"" [package.extras] docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] -test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"GraalVM\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] [[package]] name = "werkzeug" @@ -2137,7 +2369,6 @@ version = "3.1.3" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.9" -groups = ["dev"] files = [ {file = "werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e"}, {file = "werkzeug-3.1.3.tar.gz", hash = "sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746"}, @@ -2155,7 +2386,6 @@ version = "1.17.3" description = "Module for decorators, wrappers and monkey patching." 
optional = false python-versions = ">=3.8" -groups = ["main"] files = [ {file = "wrapt-1.17.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88bbae4d40d5a46142e70d58bf664a89b6b4befaea7b2ecc14e03cedb8e06c04"}, {file = "wrapt-1.17.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6b13af258d6a9ad602d57d889f83b9d5543acd471eee12eb51f5b01f8eb1bc2"}, @@ -2246,7 +2476,6 @@ version = "3.5.0" description = "Python binding for xxHash" optional = false python-versions = ">=3.7" -groups = ["dev"] files = [ {file = "xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212"}, {file = "xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520"}, @@ -2379,14 +2608,13 @@ version = "3.23.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.9" -groups = ["main"] files = [ {file = "zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e"}, {file = "zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166"}, ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] @@ -2399,7 +2627,6 @@ version = "0.24.0" description = "Zstandard bindings for Python" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] files = [ {file = "zstandard-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:af1394c2c5febc44e0bbf0fc6428263fa928b50d1b1982ce1d870dc793a8e5f4"}, {file = "zstandard-0.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5e941654cef13a1d53634ec30933722eda11f44f99e1d0bc62bbce3387580d50"}, @@ -2501,16 +2728,15 @@ files = [ {file = "zstandard-0.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:8ecd3b1f7a601f79e0cd20c26057d770219c0dc2f572ea07390248da2def79a4"}, {file = "zstandard-0.24.0.tar.gz", hash = "sha256:fe3198b81c00032326342d973e526803f183f97aa9e9a98e3f897ebafe21178f"}, ] -markers = {main = "extra == \"langchain\""} [package.extras] -cffi = ["cffi (>=1.17) ; python_version >= \"3.13\" and platform_python_implementation != \"PyPy\""] +cffi = ["cffi (>=1.17)"] [extras] langchain = ["langchain"] openai = ["openai"] [metadata] -lock-version = "2.1" +lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "6fe7fed47d629061be2cfcd2a2ea4c83201e5de130faf5f664d68845c2fea22f" +content-hash = "83ae81e7b9fd90ae8000dc0ac491ff766b899b166a5fc895043d0555267e288c" diff --git a/pyproject.toml b/pyproject.toml index 37ff24c6a..70ab88454 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ ruff = ">=0.1.8,<0.13.0" mypy = "^1.0.0" langchain-openai = ">=0.0.5,<0.4" langgraph = ">=0.2.62,<0.7.0" +autoevals = "^0.0.130" [tool.poetry.group.docs.dependencies] pdoc = "^15.0.4" From 52f7d8038908db80c6e9dd205a34c64950ba838c Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Fri, 12 Sep 2025 14:32:04 +0200 Subject: [PATCH 12/25] push --- langfuse/_client/client.py | 2 + langfuse/_client/experiments.py | 71 +++++++++++++++++++++++++-------- langfuse/experiment.py | 2 + 3 files changed, 58 insertions(+), 17 deletions(-) diff --git 
a/langfuse/_client/client.py b/langfuse/_client/client.py index 27958b967..45bc773e8 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2726,6 +2726,8 @@ async def process_item(item: ExperimentItem) -> dict: self.flush() return { + "name": name, + "description": description, "item_results": valid_results, "run_evaluations": run_evaluations, "dataset_run_id": dataset_run_id, diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py index 2e004d686..249c71d2d 100644 --- a/langfuse/_client/experiments.py +++ b/langfuse/_client/experiments.py @@ -189,6 +189,8 @@ class ExperimentResult(TypedDict): dataset_run_url: URL to view the dataset run in Langfuse UI """ + name: str + description: Optional[str] item_results: List[ExperimentItemResult] run_evaluations: List[Evaluation] dataset_run_id: Optional[str] @@ -578,27 +580,62 @@ def performance_distribution(*, item_results, **kwargs): ... -def format_experiment_results( - item_results: List[ExperimentItemResult], - run_evaluations: List[Evaluation], - experiment_name: str, - experiment_description: Optional[str] = None, - dataset_run_url: Optional[str] = None, +def format_experiment_result( + experiment_result: ExperimentResult, + *, include_item_results: bool = False, ) -> str: - """Format experiment results for display. + """Format an experiment result for human-readable display. + + Takes an ExperimentResult object and converts it into a nicely formatted + string suitable for console output or logging. The output includes experiment + overview, aggregate statistics, and optionally individual item details. Args: - item_results: Results from processing each item - run_evaluations: Results from run-level evaluators - experiment_name: Name of the experiment - experiment_description: Optional description of the experiment - dataset_run_url: Optional URL to dataset run in Langfuse UI - include_item_results: Whether to include individual item details + experiment_result: Complete experiment result containing name, description, + item results, run evaluations, and dataset run information. + include_item_results: Whether to include detailed results for each individual + item in the output. When False (default), only shows aggregate statistics. + Set to True to see input/output/scores for every processed item. Returns: - Formatted string representation of the results + A formatted multi-line string containing: + - Experiment name and description + - Number of items processed + - List of evaluation metrics used + - Average scores across all items + - Run-level evaluation results + - Dataset run URL (if available) + - Individual item details (if include_item_results=True) + + Examples: + Basic usage with aggregate results only: + ```python + result = langfuse.run_experiment(...) + print(format_experiment_result(result)) + ``` + + Detailed output including individual items: + ```python + result = langfuse.run_experiment(...) + detailed_report = format_experiment_result( + result, + include_item_results=True + ) + print(detailed_report) + ``` + + Save formatted results to file: + ```python + result = dataset.run_experiment(...) + with open("experiment_report.txt", "w") as f: + f.write(format_experiment_result(result, include_item_results=True)) + ``` """ + item_results = experiment_result["item_results"] + run_evaluations = experiment_result["run_evaluations"] + dataset_run_url = experiment_result["dataset_run_url"] + if not item_results: return "No experiment results to display." 
@@ -651,9 +688,9 @@ def format_experiment_results( # Experiment Overview output += f"\n{'─' * 50}\n" - output += f"📊 {experiment_name}" - if experiment_description: - output += f" - {experiment_description}" + output += f"📊 {experiment_result['name']}" + if experiment_result["description"]: + output += f" - {experiment_result['description']}" output += f"\n{len(item_results)} items" diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 2d54255e2..8bc953e82 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -9,6 +9,7 @@ RunEvaluatorFunction, TaskFunction, create_evaluator_from_autoevals, + format_experiment_result, ) __all__ = [ @@ -22,4 +23,5 @@ "EvaluatorFunction", "RunEvaluatorFunction", "create_evaluator_from_autoevals", + "format_experiment_result", ] From 7c583fe7cac570b1eda3378ae9bd3b1f05ec3c02 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Mon, 15 Sep 2025 13:52:49 +0200 Subject: [PATCH 13/25] push --- langfuse/_client/client.py | 22 +- langfuse/_client/datasets.py | 2 +- langfuse/_client/experiments.py | 818 ------------------------------- langfuse/experiment.py | 841 +++++++++++++++++++++++++++++++- tests/test_core_sdk.py | 2 +- 5 files changed, 829 insertions(+), 856 deletions(-) delete mode 100644 langfuse/_client/experiments.py diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 45bc773e8..514e00084 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -56,17 +56,6 @@ LANGFUSE_TRACING_ENABLED, LANGFUSE_TRACING_ENVIRONMENT, ) -from langfuse._client.experiments import ( - EvaluatorFunction, - ExperimentData, - ExperimentItem, - ExperimentItemResult, - ExperimentResult, - RunEvaluatorFunction, - TaskFunction, - _run_evaluator, - _run_task, -) from langfuse._client.resource_manager import LangfuseResourceManager from langfuse._client.span import ( LangfuseAgent, @@ -92,6 +81,17 @@ Prompt_Chat, Prompt_Text, ) +from langfuse.experiment import ( + EvaluatorFunction, + ExperimentData, + ExperimentItem, + ExperimentItemResult, + ExperimentResult, + RunEvaluatorFunction, + TaskFunction, + _run_evaluator, + _run_task, +) from langfuse.logger import langfuse_logger from langfuse.media import LangfuseMedia from langfuse.model import ( diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index 023b7f947..f62c8b0f1 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -4,7 +4,7 @@ from opentelemetry.util._decorator import _agnosticcontextmanager -from langfuse._client.experiments import ( +from langfuse.experiment import ( EvaluatorFunction, RunEvaluatorFunction, TaskFunction, diff --git a/langfuse/_client/experiments.py b/langfuse/_client/experiments.py deleted file mode 100644 index 249c71d2d..000000000 --- a/langfuse/_client/experiments.py +++ /dev/null @@ -1,818 +0,0 @@ -"""Langfuse experiment functionality for running and evaluating tasks on datasets. - -This module provides the core experiment functionality for the Langfuse Python SDK, -allowing users to run experiments on datasets with automatic tracing, evaluation, -and result formatting. -""" - -import asyncio -import logging -from typing import ( - TYPE_CHECKING, - Any, - Awaitable, - Dict, - List, - Optional, - Protocol, - TypedDict, - Union, -) - -if TYPE_CHECKING: - from langfuse._client.datasets import DatasetItemClient - - -class LocalExperimentItem(TypedDict, total=False): - """Structure for local experiment data items (not from Langfuse datasets).
- - This TypedDict defines the structure for experiment items when using local data - rather than Langfuse-hosted datasets. All fields are optional to provide - flexibility in data structure. - - Attributes: - input: The input data to pass to the task function. Can be any type that - your task function can process (string, dict, list, etc.). This is - typically the prompt, question, or data that your task will operate on. - expected_output: Optional expected/ground truth output for evaluation purposes. - Used by evaluators to assess correctness or quality. Can be None if - no ground truth is available. - metadata: Optional metadata dictionary containing additional context about - this specific item. Can include information like difficulty level, - category, source, or any other relevant attributes that evaluators - might use for context-aware evaluation. - - Examples: - Simple text processing item: - ```python - item: LocalExperimentItem = { - "input": "Summarize this article: ...", - "expected_output": "Expected summary...", - "metadata": {"difficulty": "medium", "category": "news"} - } - ``` - - Classification item: - ```python - item: LocalExperimentItem = { - "input": {"text": "This movie is great!", "context": "movie review"}, - "expected_output": "positive", - "metadata": {"dataset_source": "imdb", "confidence": 0.95} - } - ``` - - Minimal item with only input: - ```python - item: LocalExperimentItem = { - "input": "What is the capital of France?" - } - ``` - """ - - input: Any - expected_output: Any - metadata: Optional[Dict[str, Any]] - - -ExperimentItem = Union[LocalExperimentItem, "DatasetItemClient"] -"""Type alias for items that can be processed in experiments. - -Can be either: -- LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys -- DatasetItemClient: Items from Langfuse datasets with .input, .expected_output, .metadata attributes -""" - -ExperimentData = Union[List[LocalExperimentItem], List["DatasetItemClient"]] -"""Type alias for experiment datasets. - -Represents the collection of items to process in an experiment. Can be either: -- List[LocalExperimentItem]: Local data items as dictionaries -- List[DatasetItemClient]: Items from a Langfuse dataset (typically from dataset.items) -""" - - -class Evaluation(TypedDict, total=False): - """Structure for evaluation results returned by evaluator functions. - - This TypedDict defines the standardized format that all evaluator functions - must return. It provides a consistent structure for storing evaluation metrics - and their metadata across different types of evaluators. - - Attributes: - name: Unique identifier for the evaluation metric. Should be descriptive - and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). - Used for aggregation and comparison across experiment runs. - value: The evaluation score or result. Can be: - - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42) - - String: For categorical results like "positive", "negative", "neutral" - - Boolean: For binary assessments like "passes_safety_check" - - None: When evaluation cannot be computed (missing data, API errors, etc.) - comment: Optional human-readable explanation of the evaluation result. - Useful for providing context, explaining scoring rationale, or noting - special conditions. Displayed in Langfuse UI for interpretability. - metadata: Optional structured metadata about the evaluation process. 
- Can include confidence scores, intermediate calculations, model versions, - or any other relevant technical details. - - Examples: - Quantitative accuracy evaluation: - ```python - accuracy_result: Evaluation = { - "name": "accuracy", - "value": 0.85, - "comment": "85% of responses were correct", - "metadata": {"total_items": 100, "correct_items": 85} - } - ``` - - Qualitative assessment: - ```python - sentiment_result: Evaluation = { - "name": "sentiment", - "value": "positive", - "comment": "Response expresses optimistic viewpoint", - "metadata": {"confidence": 0.92, "model": "sentiment-analyzer-v2"} - } - ``` - - Binary check: - ```python - safety_result: Evaluation = { - "name": "safety_check", - "value": True, - "comment": "Content passes all safety filters" - } - ``` - - Failed evaluation: - ```python - failed_result: Evaluation = { - "name": "external_api_score", - "value": None, - "comment": "External API unavailable", - "metadata": {"error": "timeout", "retry_count": 3} - } - ``` - """ - - name: str - value: Union[int, float, str, bool, None] - comment: Optional[str] - metadata: Optional[Dict[str, Any]] - - -class ExperimentItemResult(TypedDict): - """Result structure for individual experiment items. - - Args: - item: The original experiment item that was processed - output: The actual output produced by the task - evaluations: List of evaluation results for this item - trace_id: Langfuse trace ID for this item's execution - dataset_run_id: Dataset run ID if this item was part of a Langfuse dataset - """ - - item: ExperimentItem - output: Any - evaluations: List[Evaluation] - trace_id: Optional[str] - dataset_run_id: Optional[str] - - -class ExperimentResult(TypedDict): - """Complete result structure for experiment execution. - - Args: - item_results: Results from processing each individual data item - run_evaluations: Results from run-level evaluators - dataset_run_id: ID of the dataset run (if using Langfuse datasets) - dataset_run_url: URL to view the dataset run in Langfuse UI - """ - - name: str - description: Optional[str] - item_results: List[ExperimentItemResult] - run_evaluations: List[Evaluation] - dataset_run_id: Optional[str] - dataset_run_url: Optional[str] - - -class TaskFunction(Protocol): - """Protocol defining the interface for experiment task functions. - - Task functions are the core processing functions that operate on each item - in an experiment dataset. They receive an experiment item as input and - produce some output that will be evaluated. - - Task functions must: - - Accept 'item' as a keyword argument - - Return any type of output (will be passed to evaluators) - - Can be either synchronous or asynchronous - - Should handle their own errors gracefully (exceptions will be logged) - """ - - def __call__( - self, - *, - item: ExperimentItem, - **kwargs: Dict[str, Any], - ) -> Union[Any, Awaitable[Any]]: - """Execute the task on an experiment item. - - This method defines the core processing logic for each item in your experiment. - The implementation should focus on the specific task you want to evaluate, - such as text generation, classification, summarization, etc. - - Args: - item: The experiment item to process. Can be either: - - Dict with keys like 'input', 'expected_output', 'metadata' - - Langfuse DatasetItem object with .input, .expected_output attributes - **kwargs: Additional keyword arguments that may be passed by the framework - - Returns: - Any: The output of processing the item. 
This output will be: - - Stored in the experiment results - - Passed to all item-level evaluators for assessment - - Traced automatically in Langfuse for observability - - Can return either a direct value or an awaitable (async) result. - - Examples: - Simple synchronous task: - ```python - def my_task(*, item, **kwargs): - prompt = f"Summarize: {item['input']}" - return my_llm_client.generate(prompt) - ``` - - Async task with error handling: - ```python - async def my_async_task(*, item, **kwargs): - try: - response = await openai_client.chat.completions.create( - model="gpt-4", - messages=[{"role": "user", "content": item["input"]}] - ) - return response.choices[0].message.content - except Exception as e: - # Log error and return fallback - print(f"Task failed for item {item}: {e}") - return "Error: Could not process item" - ``` - - Task using dataset item attributes: - ```python - def classification_task(*, item, **kwargs): - # Works with both dict items and DatasetItem objects - text = item["input"] if isinstance(item, dict) else item.input - return classify_text(text) - ``` - """ - ... - - -class EvaluatorFunction(Protocol): - """Protocol defining the interface for item-level evaluator functions. - - Item-level evaluators assess the quality, correctness, or other properties - of individual task outputs. They receive the input, output, expected output, - and metadata for each item and return evaluation metrics. - - Evaluators should: - - Accept input, output, expected_output, and metadata as keyword arguments - - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields - - Be deterministic when possible for reproducible results - - Handle edge cases gracefully (missing expected output, malformed data, etc.) - - Can be either synchronous or asynchronous - """ - - def __call__( - self, - *, - input: Any, - output: Any, - expected_output: Any, - metadata: Optional[Dict[str, Any]], - **kwargs: Dict[str, Any], - ) -> Union[ - Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] - ]: - """Evaluate a task output for quality, correctness, or other metrics. - - This method should implement specific evaluation logic such as accuracy checking, - similarity measurement, toxicity detection, fluency assessment, etc. - - Args: - input: The original input that was passed to the task function. - This is typically the item['input'] or item.input value. - output: The output produced by the task function for this input. - This is the direct return value from your task function. - expected_output: The expected/ground truth output for comparison. - May be None if not available in the dataset. Evaluators should - handle this case appropriately. - metadata: Optional metadata from the experiment item that might - contain additional context for evaluation (categories, difficulty, etc.) 
- **kwargs: Additional keyword arguments that may be passed by the framework - - Returns: - Evaluation results in one of these formats: - - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."} - - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}] - - Awaitable returning either of the above (for async evaluators) - - Each Evaluation dict should contain: - - name (str): Unique identifier for this evaluation metric - - value (int|float|str|bool): The evaluation score or result - - comment (str, optional): Human-readable explanation of the result - - metadata (dict, optional): Additional structured data about the evaluation - - Examples: - Simple accuracy evaluator: - ```python - def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): - if expected_output is None: - return {"name": "accuracy", "value": None, "comment": "No expected output"} - - is_correct = output.strip().lower() == expected_output.strip().lower() - return { - "name": "accuracy", - "value": 1.0 if is_correct else 0.0, - "comment": "Exact match" if is_correct else "No match" - } - ``` - - Multi-metric evaluator: - ```python - def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): - results = [] - - # Length check - results.append({ - "name": "output_length", - "value": len(output), - "comment": f"Output contains {len(output)} characters" - }) - - # Sentiment analysis - sentiment_score = analyze_sentiment(output) - results.append({ - "name": "sentiment", - "value": sentiment_score, - "comment": f"Sentiment score: {sentiment_score:.2f}" - }) - - return results - ``` - - Async evaluator using external API: - ```python - async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs): - prompt = f"Rate the quality of this response on a scale of 1-10:\n" - prompt += f"Question: {input}\nResponse: {output}" - - response = await openai_client.chat.completions.create( - model="gpt-4", - messages=[{"role": "user", "content": prompt}] - ) - - try: - score = float(response.choices[0].message.content.strip()) - return { - "name": "llm_judge_quality", - "value": score, - "comment": f"LLM judge rated this {score}/10" - } - except ValueError: - return { - "name": "llm_judge_quality", - "value": None, - "comment": "Could not parse LLM judge score" - } - ``` - - Context-aware evaluator: - ```python - def context_evaluator(*, input, output, metadata=None, **kwargs): - # Use metadata for context-specific evaluation - difficulty = metadata.get("difficulty", "medium") if metadata else "medium" - - # Adjust expectations based on difficulty - min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty] - - meets_requirement = len(output) >= min_length - return { - "name": f"meets_{difficulty}_requirement", - "value": meets_requirement, - "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement" - } - ``` - """ - ... - - -class RunEvaluatorFunction(Protocol): - """Protocol defining the interface for run-level evaluator functions. - - Run-level evaluators assess aggregate properties of the entire experiment run, - computing metrics that span across all items rather than individual outputs. - They receive the complete results from all processed items and can compute - statistics like averages, distributions, correlations, or other aggregate metrics. 
- - Run evaluators should: - - Accept item_results as a keyword argument containing all item results - - Return Evaluation dict(s) with aggregate metrics - - Handle cases where some items may have failed processing - - Compute meaningful statistics across the dataset - - Can be either synchronous or asynchronous - """ - - def __call__( - self, - *, - item_results: List[ExperimentItemResult], - **kwargs: Dict[str, Any], - ) -> Union[ - Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] - ]: - """Evaluate the entire experiment run with aggregate metrics. - - This method should implement aggregate evaluation logic such as computing - averages, calculating distributions, finding correlations, detecting patterns - across items, or performing statistical analysis on the experiment results. - - Args: - item_results: List of results from all successfully processed experiment items. - Each item result contains: - - item: The original experiment item - - output: The task function's output for this item - - evaluations: List of item-level evaluation results - - trace_id: Langfuse trace ID for this execution - - dataset_run_id: Dataset run ID (if using Langfuse datasets) - - Note: This list only includes items that were successfully processed. - Failed items are excluded but logged separately. - **kwargs: Additional keyword arguments that may be passed by the framework - - Returns: - Evaluation results in one of these formats: - - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."} - - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}] - - Awaitable returning either of the above (for async evaluators) - - Each Evaluation dict should contain: - - name (str): Unique identifier for this run-level metric - - value (int|float|str|bool): The aggregate evaluation result - - comment (str, optional): Human-readable explanation of the metric - - metadata (dict, optional): Additional structured data about the evaluation - - Examples: - Average accuracy calculator: - ```python - def average_accuracy(*, item_results, **kwargs): - if not item_results: - return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"} - - accuracy_values = [] - for result in item_results: - for evaluation in result["evaluations"]: - if evaluation["name"] == "accuracy": - accuracy_values.append(evaluation["value"]) - - if not accuracy_values: - return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"} - - avg = sum(accuracy_values) / len(accuracy_values) - return { - "name": "avg_accuracy", - "value": avg, - "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}" - } - ``` - - Multiple aggregate metrics: - ```python - def statistical_summary(*, item_results, **kwargs): - if not item_results: - return [] - - results = [] - - # Calculate output length statistics - lengths = [len(str(result["output"])) for result in item_results] - results.extend([ - {"name": "avg_output_length", "value": sum(lengths) / len(lengths)}, - {"name": "min_output_length", "value": min(lengths)}, - {"name": "max_output_length", "value": max(lengths)} - ]) - - # Success rate - total_items = len(item_results) # Only successful items are included - results.append({ - "name": "processing_success_rate", - "value": 1.0, # All items in item_results succeeded - "comment": f"Successfully processed {total_items} items" - }) - - return results - ``` - - Async run evaluator with external analysis: - ```python - async def 
llm_batch_analysis(*, item_results, **kwargs): - # Prepare batch analysis prompt - outputs = [result["output"] for result in item_results] - prompt = f"Analyze these {len(outputs)} outputs for common themes:\n" - prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs)) - - response = await openai_client.chat.completions.create( - model="gpt-4", - messages=[{"role": "user", "content": prompt}] - ) - - return { - "name": "thematic_analysis", - "value": response.choices[0].message.content, - "comment": f"LLM analysis of {len(outputs)} outputs" - } - ``` - - Performance distribution analysis: - ```python - def performance_distribution(*, item_results, **kwargs): - # Extract all evaluation scores - all_scores = [] - score_by_metric = {} - - for result in item_results: - for evaluation in result["evaluations"]: - metric_name = evaluation["name"] - value = evaluation["value"] - - if isinstance(value, (int, float)): - all_scores.append(value) - if metric_name not in score_by_metric: - score_by_metric[metric_name] = [] - score_by_metric[metric_name].append(value) - - results = [] - - # Overall score distribution - if all_scores: - import statistics - results.append({ - "name": "score_std_dev", - "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0, - "comment": f"Standard deviation across all numeric scores" - }) - - # Per-metric statistics - for metric, scores in score_by_metric.items(): - if len(scores) > 1: - results.append({ - "name": f"{metric}_variance", - "value": statistics.variance(scores), - "comment": f"Variance in {metric} across {len(scores)} items" - }) - - return results - ``` - """ - ... - - -def format_experiment_result( - experiment_result: ExperimentResult, - *, - include_item_results: bool = False, -) -> str: - """Format an experiment result for human-readable display. - - Takes an ExperimentResult object and converts it into a nicely formatted - string suitable for console output or logging. The output includes experiment - overview, aggregate statistics, and optionally individual item details. - - Args: - experiment_result: Complete experiment result containing name, description, - item results, run evaluations, and dataset run information. - include_item_results: Whether to include detailed results for each individual - item in the output. When False (default), only shows aggregate statistics. - Set to True to see input/output/scores for every processed item. - - Returns: - A formatted multi-line string containing: - - Experiment name and description - - Number of items processed - - List of evaluation metrics used - - Average scores across all items - - Run-level evaluation results - - Dataset run URL (if available) - - Individual item details (if include_item_results=True) - - Examples: - Basic usage with aggregate results only: - ```python - result = langfuse.run_experiment(...) - print(format_experiment_result(result)) - ``` - - Detailed output including individual items: - ```python - result = langfuse.run_experiment(...) - detailed_report = format_experiment_result( - result, - include_item_results=True - ) - print(detailed_report) - ``` - - Save formatted results to file: - ```python - result = dataset.run_experiment(...) 
- with open("experiment_report.txt", "w") as f: - f.write(format_experiment_result(result, include_item_results=True)) - ``` - """ - item_results = experiment_result["item_results"] - run_evaluations = experiment_result["run_evaluations"] - dataset_run_url = experiment_result["dataset_run_url"] - - if not item_results: - return "No experiment results to display." - - output = "" - - # Individual results - if include_item_results: - for i, result in enumerate(item_results): - output += f"\n{i + 1}. Item {i + 1}:\n" - - # Input, expected, and actual - item_input = None - if isinstance(result["item"], dict): - item_input = result["item"].get("input") - elif hasattr(result["item"], "input"): - item_input = result["item"].input - - if item_input is not None: - output += f" Input: {_format_value(item_input)}\n" - - expected_output = None - if isinstance(result["item"], dict): - expected_output = result["item"].get("expected_output") - elif hasattr(result["item"], "expected_output"): - expected_output = result["item"].expected_output - - if expected_output is not None: - output += f" Expected: {_format_value(expected_output)}\n" - output += f" Actual: {_format_value(result['output'])}\n" - - # Scores - if result["evaluations"]: - output += " Scores:\n" - for evaluation in result["evaluations"]: - score = evaluation["value"] - if isinstance(score, (int, float)): - score = f"{score:.3f}" - output += f" โ€ข {evaluation['name']}: {score}" - if evaluation.get("comment"): - output += f"\n ๐Ÿ’ญ {evaluation['comment']}" - output += "\n" - - # Trace link - if result.get("trace_id"): - # Note: We'd need the langfuse client to generate the actual URL - output += f"\n Trace ID: {result['trace_id']}\n" - else: - output += f"Individual Results: Hidden ({len(item_results)} items)\n" - output += "๐Ÿ’ก Set include_item_results=True to view them\n" - - # Experiment Overview - output += f"\n{'โ”€' * 50}\n" - output += f"๐Ÿ“Š {experiment_result['name']}" - if experiment_result["description"]: - output += f" - {experiment_result['description']}" - - output += f"\n{len(item_results)} items" - - # Get unique evaluation names - evaluation_names = set() - for result in item_results: - for evaluation in result["evaluations"]: - evaluation_names.add(evaluation["name"]) - - if evaluation_names: - output += "\nEvaluations:" - for eval_name in evaluation_names: - output += f"\n โ€ข {eval_name}" - output += "\n" - - # Average scores - if evaluation_names: - output += "\nAverage Scores:" - for eval_name in evaluation_names: - scores = [] - for result in item_results: - for evaluation in result["evaluations"]: - if evaluation["name"] == eval_name and isinstance( - evaluation["value"], (int, float) - ): - scores.append(evaluation["value"]) - - if scores: - avg = sum(scores) / len(scores) - output += f"\n โ€ข {eval_name}: {avg:.3f}" - output += "\n" - - # Run evaluations - if run_evaluations: - output += "\nRun Evaluations:" - for run_eval in run_evaluations: - score = run_eval["value"] - if isinstance(score, (int, float)): - score = f"{score:.3f}" - output += f"\n โ€ข {run_eval['name']}: {score}" - if run_eval.get("comment"): - output += f"\n ๐Ÿ’ญ {run_eval['comment']}" - output += "\n" - - if dataset_run_url: - output += f"\n๐Ÿ”— Dataset Run:\n {dataset_run_url}" - - return output - - -def _format_value(value: Any) -> str: - """Format a value for display.""" - if isinstance(value, str): - return value[:50] + "..." 
if len(value) > 50 else value - return str(value) - - -async def _run_evaluator( - evaluator: Union[EvaluatorFunction, RunEvaluatorFunction], **kwargs: Any -) -> List[Evaluation]: - """Run an evaluator function and normalize the result.""" - try: - result = evaluator(**kwargs) - - # Handle async evaluators - if asyncio.iscoroutine(result): - result = await result - - # Normalize to list - if isinstance(result, dict): - return [result] - - elif isinstance(result, list): - return result - - else: - return [] - - except Exception as e: - evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator") - logging.getLogger("langfuse").error(f"Evaluator {evaluator_name} failed: {e}") - return [] - - -async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any: - """Run a task function and handle sync/async.""" - result = task(item=item) - - # Handle async tasks - if asyncio.iscoroutine(result): - result = await result - - return result - - -def create_evaluator_from_autoevals( - autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]] -) -> EvaluatorFunction: - """Create a Langfuse evaluator from an autoevals evaluator. - - Args: - autoevals_evaluator: An autoevals evaluator instance - **kwargs: Additional arguments passed to the evaluator - - Returns: - A Langfuse-compatible evaluator function - """ - - def langfuse_evaluator( - *, - input: Any, - output: Any, - expected_output: Any, - metadata: Optional[Dict[str, Any]], - **kwargs: Dict[str, Any], - ) -> Evaluation: - evaluation = autoevals_evaluator( - input=input, output=output, expected=expected_output, **kwargs - ) - - return Evaluation( - name=evaluation.name, value=evaluation.score, metadata=evaluation.metadata - ) - - return langfuse_evaluator diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 8bc953e82..249c71d2d 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -1,27 +1,818 @@ -from ._client.experiments import ( - Evaluation, - EvaluatorFunction, - ExperimentData, - ExperimentItem, - ExperimentItemResult, - ExperimentResult, - LocalExperimentItem, - RunEvaluatorFunction, - TaskFunction, - create_evaluator_from_autoevals, - format_experiment_result, +"""Langfuse experiment functionality for running and evaluating tasks on datasets. + +This module provides the core experiment functionality for the Langfuse Python SDK, +allowing users to run experiments on datasets with automatic tracing, evaluation, +and result formatting. +""" + +import asyncio +import logging +from typing import ( + TYPE_CHECKING, + Any, + Awaitable, + Dict, + List, + Optional, + Protocol, + TypedDict, + Union, ) -__all__ = [ - "LocalExperimentItem", - "ExperimentItem", - "ExperimentData", - "Evaluation", - "ExperimentItemResult", - "ExperimentResult", - "TaskFunction", - "EvaluatorFunction", - "RunEvaluatorFunction", - "create_evaluator_from_autoevals", - "format_experiment_result", -] +if TYPE_CHECKING: + from langfuse._client.datasets import DatasetItemClient + + +class LocalExperimentItem(TypedDict, total=False): + """Structure for local experiment data items (not from Langfuse datasets). + + This TypedDict defines the structure for experiment items when using local data + rather than Langfuse-hosted datasets. All fields are optional to provide + flexibility in data structure. + + Attributes: + input: The input data to pass to the task function. Can be any type that + your task function can process (string, dict, list, etc.). This is + typically the prompt, question, or data that your task will operate on. 
+ expected_output: Optional expected/ground truth output for evaluation purposes. + Used by evaluators to assess correctness or quality. Can be None if + no ground truth is available. + metadata: Optional metadata dictionary containing additional context about + this specific item. Can include information like difficulty level, + category, source, or any other relevant attributes that evaluators + might use for context-aware evaluation. + + Examples: + Simple text processing item: + ```python + item: LocalExperimentItem = { + "input": "Summarize this article: ...", + "expected_output": "Expected summary...", + "metadata": {"difficulty": "medium", "category": "news"} + } + ``` + + Classification item: + ```python + item: LocalExperimentItem = { + "input": {"text": "This movie is great!", "context": "movie review"}, + "expected_output": "positive", + "metadata": {"dataset_source": "imdb", "confidence": 0.95} + } + ``` + + Minimal item with only input: + ```python + item: LocalExperimentItem = { + "input": "What is the capital of France?" + } + ``` + """ + + input: Any + expected_output: Any + metadata: Optional[Dict[str, Any]] + + +ExperimentItem = Union[LocalExperimentItem, "DatasetItemClient"] +"""Type alias for items that can be processed in experiments. + +Can be either: +- LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys +- DatasetItemClient: Items from Langfuse datasets with .input, .expected_output, .metadata attributes +""" + +ExperimentData = Union[List[LocalExperimentItem], List["DatasetItemClient"]] +"""Type alias for experiment datasets. + +Represents the collection of items to process in an experiment. Can be either: +- List[LocalExperimentItem]: Local data items as dictionaries +- List[DatasetItemClient]: Items from a Langfuse dataset (typically from dataset.items) +""" + + +class Evaluation(TypedDict, total=False): + """Structure for evaluation results returned by evaluator functions. + + This TypedDict defines the standardized format that all evaluator functions + must return. It provides a consistent structure for storing evaluation metrics + and their metadata across different types of evaluators. + + Attributes: + name: Unique identifier for the evaluation metric. Should be descriptive + and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). + Used for aggregation and comparison across experiment runs. + value: The evaluation score or result. Can be: + - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42) + - String: For categorical results like "positive", "negative", "neutral" + - Boolean: For binary assessments like "passes_safety_check" + - None: When evaluation cannot be computed (missing data, API errors, etc.) + comment: Optional human-readable explanation of the evaluation result. + Useful for providing context, explaining scoring rationale, or noting + special conditions. Displayed in Langfuse UI for interpretability. + metadata: Optional structured metadata about the evaluation process. + Can include confidence scores, intermediate calculations, model versions, + or any other relevant technical details. 
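To make the expected return shape concrete before the examples below, here is a small self-contained sketch (the evaluator name and word-overlap heuristic are illustrative only, not part of the SDK) of an evaluator that returns either a single evaluation dict or a list of them, together with the same list-normalization that the module-level `_run_evaluator` helper applies later in this file:

```python
from typing import Any, Dict, List, Optional, Union

Evaluation = Dict[str, Any]  # stand-in for the Evaluation TypedDict defined in this module


def word_overlap_evaluator(
    *, input: Any, output: str, expected_output: Optional[str] = None, **kwargs: Any
) -> Union[Evaluation, List[Evaluation]]:
    """Illustrative evaluator: one metric without a reference, two with one."""
    length_eval: Evaluation = {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters",
    }
    if expected_output is None:
        return length_eval

    overlap = len(set(output.lower().split()) & set(expected_output.lower().split()))
    return [
        length_eval,
        {
            "name": "word_overlap",
            "value": overlap,
            "comment": f"{overlap} words shared with the expected output",
            "metadata": {"method": "set intersection"},
        },
    ]


def normalize(result: Union[Evaluation, List[Evaluation]]) -> List[Evaluation]:
    """Mirrors the single-dict vs. list normalization done by _run_evaluator."""
    if isinstance(result, dict):
        return [result]
    return list(result) if isinstance(result, list) else []


if __name__ == "__main__":
    for evaluation in normalize(
        word_overlap_evaluator(
            input="What is the capital of France?",
            output="The capital of France is Paris",
            expected_output="Paris",
        )
    ):
        print(evaluation["name"], evaluation["value"])
```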
+ + Examples: + Quantitative accuracy evaluation: + ```python + accuracy_result: Evaluation = { + "name": "accuracy", + "value": 0.85, + "comment": "85% of responses were correct", + "metadata": {"total_items": 100, "correct_items": 85} + } + ``` + + Qualitative assessment: + ```python + sentiment_result: Evaluation = { + "name": "sentiment", + "value": "positive", + "comment": "Response expresses optimistic viewpoint", + "metadata": {"confidence": 0.92, "model": "sentiment-analyzer-v2"} + } + ``` + + Binary check: + ```python + safety_result: Evaluation = { + "name": "safety_check", + "value": True, + "comment": "Content passes all safety filters" + } + ``` + + Failed evaluation: + ```python + failed_result: Evaluation = { + "name": "external_api_score", + "value": None, + "comment": "External API unavailable", + "metadata": {"error": "timeout", "retry_count": 3} + } + ``` + """ + + name: str + value: Union[int, float, str, bool, None] + comment: Optional[str] + metadata: Optional[Dict[str, Any]] + + +class ExperimentItemResult(TypedDict): + """Result structure for individual experiment items. + + Args: + item: The original experiment item that was processed + output: The actual output produced by the task + evaluations: List of evaluation results for this item + trace_id: Langfuse trace ID for this item's execution + dataset_run_id: Dataset run ID if this item was part of a Langfuse dataset + """ + + item: ExperimentItem + output: Any + evaluations: List[Evaluation] + trace_id: Optional[str] + dataset_run_id: Optional[str] + + +class ExperimentResult(TypedDict): + """Complete result structure for experiment execution. + + Args: + item_results: Results from processing each individual data item + run_evaluations: Results from run-level evaluators + dataset_run_id: ID of the dataset run (if using Langfuse datasets) + dataset_run_url: URL to view the dataset run in Langfuse UI + """ + + name: str + description: Optional[str] + item_results: List[ExperimentItemResult] + run_evaluations: List[Evaluation] + dataset_run_id: Optional[str] + dataset_run_url: Optional[str] + + +class TaskFunction(Protocol): + """Protocol defining the interface for experiment task functions. + + Task functions are the core processing functions that operate on each item + in an experiment dataset. They receive an experiment item as input and + produce some output that will be evaluated. + + Task functions must: + - Accept 'item' as a keyword argument + - Return any type of output (will be passed to evaluators) + - Can be either synchronous or asynchronous + - Should handle their own errors gracefully (exceptions will be logged) + """ + + def __call__( + self, + *, + item: ExperimentItem, + **kwargs: Dict[str, Any], + ) -> Union[Any, Awaitable[Any]]: + """Execute the task on an experiment item. + + This method defines the core processing logic for each item in your experiment. + The implementation should focus on the specific task you want to evaluate, + such as text generation, classification, summarization, etc. + + Args: + item: The experiment item to process. Can be either: + - Dict with keys like 'input', 'expected_output', 'metadata' + - Langfuse DatasetItem object with .input, .expected_output attributes + **kwargs: Additional keyword arguments that may be passed by the framework + + Returns: + Any: The output of processing the item. 
This output will be: + - Stored in the experiment results + - Passed to all item-level evaluators for assessment + - Traced automatically in Langfuse for observability + + Can return either a direct value or an awaitable (async) result. + + Examples: + Simple synchronous task: + ```python + def my_task(*, item, **kwargs): + prompt = f"Summarize: {item['input']}" + return my_llm_client.generate(prompt) + ``` + + Async task with error handling: + ```python + async def my_async_task(*, item, **kwargs): + try: + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": item["input"]}] + ) + return response.choices[0].message.content + except Exception as e: + # Log error and return fallback + print(f"Task failed for item {item}: {e}") + return "Error: Could not process item" + ``` + + Task using dataset item attributes: + ```python + def classification_task(*, item, **kwargs): + # Works with both dict items and DatasetItem objects + text = item["input"] if isinstance(item, dict) else item.input + return classify_text(text) + ``` + """ + ... + + +class EvaluatorFunction(Protocol): + """Protocol defining the interface for item-level evaluator functions. + + Item-level evaluators assess the quality, correctness, or other properties + of individual task outputs. They receive the input, output, expected output, + and metadata for each item and return evaluation metrics. + + Evaluators should: + - Accept input, output, expected_output, and metadata as keyword arguments + - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields + - Be deterministic when possible for reproducible results + - Handle edge cases gracefully (missing expected output, malformed data, etc.) + - Can be either synchronous or asynchronous + """ + + def __call__( + self, + *, + input: Any, + output: Any, + expected_output: Any, + metadata: Optional[Dict[str, Any]], + **kwargs: Dict[str, Any], + ) -> Union[ + Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] + ]: + """Evaluate a task output for quality, correctness, or other metrics. + + This method should implement specific evaluation logic such as accuracy checking, + similarity measurement, toxicity detection, fluency assessment, etc. + + Args: + input: The original input that was passed to the task function. + This is typically the item['input'] or item.input value. + output: The output produced by the task function for this input. + This is the direct return value from your task function. + expected_output: The expected/ground truth output for comparison. + May be None if not available in the dataset. Evaluators should + handle this case appropriately. + metadata: Optional metadata from the experiment item that might + contain additional context for evaluation (categories, difficulty, etc.) 
+ **kwargs: Additional keyword arguments that may be passed by the framework + + Returns: + Evaluation results in one of these formats: + - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."} + - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}] + - Awaitable returning either of the above (for async evaluators) + + Each Evaluation dict should contain: + - name (str): Unique identifier for this evaluation metric + - value (int|float|str|bool): The evaluation score or result + - comment (str, optional): Human-readable explanation of the result + - metadata (dict, optional): Additional structured data about the evaluation + + Examples: + Simple accuracy evaluator: + ```python + def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): + if expected_output is None: + return {"name": "accuracy", "value": None, "comment": "No expected output"} + + is_correct = output.strip().lower() == expected_output.strip().lower() + return { + "name": "accuracy", + "value": 1.0 if is_correct else 0.0, + "comment": "Exact match" if is_correct else "No match" + } + ``` + + Multi-metric evaluator: + ```python + def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): + results = [] + + # Length check + results.append({ + "name": "output_length", + "value": len(output), + "comment": f"Output contains {len(output)} characters" + }) + + # Sentiment analysis + sentiment_score = analyze_sentiment(output) + results.append({ + "name": "sentiment", + "value": sentiment_score, + "comment": f"Sentiment score: {sentiment_score:.2f}" + }) + + return results + ``` + + Async evaluator using external API: + ```python + async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs): + prompt = f"Rate the quality of this response on a scale of 1-10:\n" + prompt += f"Question: {input}\nResponse: {output}" + + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": prompt}] + ) + + try: + score = float(response.choices[0].message.content.strip()) + return { + "name": "llm_judge_quality", + "value": score, + "comment": f"LLM judge rated this {score}/10" + } + except ValueError: + return { + "name": "llm_judge_quality", + "value": None, + "comment": "Could not parse LLM judge score" + } + ``` + + Context-aware evaluator: + ```python + def context_evaluator(*, input, output, metadata=None, **kwargs): + # Use metadata for context-specific evaluation + difficulty = metadata.get("difficulty", "medium") if metadata else "medium" + + # Adjust expectations based on difficulty + min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty] + + meets_requirement = len(output) >= min_length + return { + "name": f"meets_{difficulty}_requirement", + "value": meets_requirement, + "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement" + } + ``` + """ + ... + + +class RunEvaluatorFunction(Protocol): + """Protocol defining the interface for run-level evaluator functions. + + Run-level evaluators assess aggregate properties of the entire experiment run, + computing metrics that span across all items rather than individual outputs. + They receive the complete results from all processed items and can compute + statistics like averages, distributions, correlations, or other aggregate metrics. 
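Bringing the two evaluator kinds together, the sketch below wires an item-level evaluator and a run-level evaluator into `Langfuse.run_experiment` as added by this patch. The task, data, and metric names are toy placeholders, the task follows the keyword-only `TaskFunction` protocol documented here, and Langfuse credentials are assumed to be configured via the usual environment variables:

```python
from langfuse import Langfuse

langfuse = Langfuse()  # assumes LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY are set


def echo_task(*, item, **kwargs):
    # Toy task: "answer" by echoing the input back.
    return f"Answer to: {item['input']}"


def length_evaluator(*, input, output, expected_output=None, **kwargs):
    # Item-level metric: size of the produced output.
    return {"name": "output_length", "value": len(output)}


def average_length(*, item_results, **kwargs):
    # Run-level metric: mean of the item-level "output_length" scores.
    lengths = [
        evaluation["value"]
        for result in item_results
        for evaluation in result["evaluations"]
        if evaluation["name"] == "output_length"
    ]
    if not lengths:
        return {"name": "avg_output_length", "value": None, "comment": "No lengths recorded"}
    return {
        "name": "avg_output_length",
        "value": sum(lengths) / len(lengths),
        "comment": f"Averaged over {len(lengths)} items",
    }


result = langfuse.run_experiment(
    name="Length sanity check",
    data=[
        {"input": "What is the capital of France?"},
        {"input": "Name a prime number."},
    ],
    task=echo_task,
    evaluators=[length_evaluator],
    run_evaluators=[average_length],
)
print(result["run_evaluations"])
```

At this point in the patch series the experiment result is still a TypedDict, so dictionary access is used above; the later "move to classes" commit switches the result objects to attribute access.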
+ + Run evaluators should: + - Accept item_results as a keyword argument containing all item results + - Return Evaluation dict(s) with aggregate metrics + - Handle cases where some items may have failed processing + - Compute meaningful statistics across the dataset + - Can be either synchronous or asynchronous + """ + + def __call__( + self, + *, + item_results: List[ExperimentItemResult], + **kwargs: Dict[str, Any], + ) -> Union[ + Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]] + ]: + """Evaluate the entire experiment run with aggregate metrics. + + This method should implement aggregate evaluation logic such as computing + averages, calculating distributions, finding correlations, detecting patterns + across items, or performing statistical analysis on the experiment results. + + Args: + item_results: List of results from all successfully processed experiment items. + Each item result contains: + - item: The original experiment item + - output: The task function's output for this item + - evaluations: List of item-level evaluation results + - trace_id: Langfuse trace ID for this execution + - dataset_run_id: Dataset run ID (if using Langfuse datasets) + + Note: This list only includes items that were successfully processed. + Failed items are excluded but logged separately. + **kwargs: Additional keyword arguments that may be passed by the framework + + Returns: + Evaluation results in one of these formats: + - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."} + - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}] + - Awaitable returning either of the above (for async evaluators) + + Each Evaluation dict should contain: + - name (str): Unique identifier for this run-level metric + - value (int|float|str|bool): The aggregate evaluation result + - comment (str, optional): Human-readable explanation of the metric + - metadata (dict, optional): Additional structured data about the evaluation + + Examples: + Average accuracy calculator: + ```python + def average_accuracy(*, item_results, **kwargs): + if not item_results: + return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"} + + accuracy_values = [] + for result in item_results: + for evaluation in result["evaluations"]: + if evaluation["name"] == "accuracy": + accuracy_values.append(evaluation["value"]) + + if not accuracy_values: + return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"} + + avg = sum(accuracy_values) / len(accuracy_values) + return { + "name": "avg_accuracy", + "value": avg, + "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}" + } + ``` + + Multiple aggregate metrics: + ```python + def statistical_summary(*, item_results, **kwargs): + if not item_results: + return [] + + results = [] + + # Calculate output length statistics + lengths = [len(str(result["output"])) for result in item_results] + results.extend([ + {"name": "avg_output_length", "value": sum(lengths) / len(lengths)}, + {"name": "min_output_length", "value": min(lengths)}, + {"name": "max_output_length", "value": max(lengths)} + ]) + + # Success rate + total_items = len(item_results) # Only successful items are included + results.append({ + "name": "processing_success_rate", + "value": 1.0, # All items in item_results succeeded + "comment": f"Successfully processed {total_items} items" + }) + + return results + ``` + + Async run evaluator with external analysis: + ```python + async def 
llm_batch_analysis(*, item_results, **kwargs): + # Prepare batch analysis prompt + outputs = [result["output"] for result in item_results] + prompt = f"Analyze these {len(outputs)} outputs for common themes:\n" + prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs)) + + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": prompt}] + ) + + return { + "name": "thematic_analysis", + "value": response.choices[0].message.content, + "comment": f"LLM analysis of {len(outputs)} outputs" + } + ``` + + Performance distribution analysis: + ```python + def performance_distribution(*, item_results, **kwargs): + # Extract all evaluation scores + all_scores = [] + score_by_metric = {} + + for result in item_results: + for evaluation in result["evaluations"]: + metric_name = evaluation["name"] + value = evaluation["value"] + + if isinstance(value, (int, float)): + all_scores.append(value) + if metric_name not in score_by_metric: + score_by_metric[metric_name] = [] + score_by_metric[metric_name].append(value) + + results = [] + + # Overall score distribution + if all_scores: + import statistics + results.append({ + "name": "score_std_dev", + "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0, + "comment": f"Standard deviation across all numeric scores" + }) + + # Per-metric statistics + for metric, scores in score_by_metric.items(): + if len(scores) > 1: + results.append({ + "name": f"{metric}_variance", + "value": statistics.variance(scores), + "comment": f"Variance in {metric} across {len(scores)} items" + }) + + return results + ``` + """ + ... + + +def format_experiment_result( + experiment_result: ExperimentResult, + *, + include_item_results: bool = False, +) -> str: + """Format an experiment result for human-readable display. + + Takes an ExperimentResult object and converts it into a nicely formatted + string suitable for console output or logging. The output includes experiment + overview, aggregate statistics, and optionally individual item details. + + Args: + experiment_result: Complete experiment result containing name, description, + item results, run evaluations, and dataset run information. + include_item_results: Whether to include detailed results for each individual + item in the output. When False (default), only shows aggregate statistics. + Set to True to see input/output/scores for every processed item. + + Returns: + A formatted multi-line string containing: + - Experiment name and description + - Number of items processed + - List of evaluation metrics used + - Average scores across all items + - Run-level evaluation results + - Dataset run URL (if available) + - Individual item details (if include_item_results=True) + + Examples: + Basic usage with aggregate results only: + ```python + result = langfuse.run_experiment(...) + print(format_experiment_result(result)) + ``` + + Detailed output including individual items: + ```python + result = langfuse.run_experiment(...) + detailed_report = format_experiment_result( + result, + include_item_results=True + ) + print(detailed_report) + ``` + + Save formatted results to file: + ```python + result = dataset.run_experiment(...) 
+ with open("experiment_report.txt", "w") as f: + f.write(format_experiment_result(result, include_item_results=True)) + ``` + """ + item_results = experiment_result["item_results"] + run_evaluations = experiment_result["run_evaluations"] + dataset_run_url = experiment_result["dataset_run_url"] + + if not item_results: + return "No experiment results to display." + + output = "" + + # Individual results + if include_item_results: + for i, result in enumerate(item_results): + output += f"\n{i + 1}. Item {i + 1}:\n" + + # Input, expected, and actual + item_input = None + if isinstance(result["item"], dict): + item_input = result["item"].get("input") + elif hasattr(result["item"], "input"): + item_input = result["item"].input + + if item_input is not None: + output += f" Input: {_format_value(item_input)}\n" + + expected_output = None + if isinstance(result["item"], dict): + expected_output = result["item"].get("expected_output") + elif hasattr(result["item"], "expected_output"): + expected_output = result["item"].expected_output + + if expected_output is not None: + output += f" Expected: {_format_value(expected_output)}\n" + output += f" Actual: {_format_value(result['output'])}\n" + + # Scores + if result["evaluations"]: + output += " Scores:\n" + for evaluation in result["evaluations"]: + score = evaluation["value"] + if isinstance(score, (int, float)): + score = f"{score:.3f}" + output += f" โ€ข {evaluation['name']}: {score}" + if evaluation.get("comment"): + output += f"\n ๐Ÿ’ญ {evaluation['comment']}" + output += "\n" + + # Trace link + if result.get("trace_id"): + # Note: We'd need the langfuse client to generate the actual URL + output += f"\n Trace ID: {result['trace_id']}\n" + else: + output += f"Individual Results: Hidden ({len(item_results)} items)\n" + output += "๐Ÿ’ก Set include_item_results=True to view them\n" + + # Experiment Overview + output += f"\n{'โ”€' * 50}\n" + output += f"๐Ÿ“Š {experiment_result['name']}" + if experiment_result["description"]: + output += f" - {experiment_result['description']}" + + output += f"\n{len(item_results)} items" + + # Get unique evaluation names + evaluation_names = set() + for result in item_results: + for evaluation in result["evaluations"]: + evaluation_names.add(evaluation["name"]) + + if evaluation_names: + output += "\nEvaluations:" + for eval_name in evaluation_names: + output += f"\n โ€ข {eval_name}" + output += "\n" + + # Average scores + if evaluation_names: + output += "\nAverage Scores:" + for eval_name in evaluation_names: + scores = [] + for result in item_results: + for evaluation in result["evaluations"]: + if evaluation["name"] == eval_name and isinstance( + evaluation["value"], (int, float) + ): + scores.append(evaluation["value"]) + + if scores: + avg = sum(scores) / len(scores) + output += f"\n โ€ข {eval_name}: {avg:.3f}" + output += "\n" + + # Run evaluations + if run_evaluations: + output += "\nRun Evaluations:" + for run_eval in run_evaluations: + score = run_eval["value"] + if isinstance(score, (int, float)): + score = f"{score:.3f}" + output += f"\n โ€ข {run_eval['name']}: {score}" + if run_eval.get("comment"): + output += f"\n ๐Ÿ’ญ {run_eval['comment']}" + output += "\n" + + if dataset_run_url: + output += f"\n๐Ÿ”— Dataset Run:\n {dataset_run_url}" + + return output + + +def _format_value(value: Any) -> str: + """Format a value for display.""" + if isinstance(value, str): + return value[:50] + "..." 
if len(value) > 50 else value + return str(value) + + +async def _run_evaluator( + evaluator: Union[EvaluatorFunction, RunEvaluatorFunction], **kwargs: Any +) -> List[Evaluation]: + """Run an evaluator function and normalize the result.""" + try: + result = evaluator(**kwargs) + + # Handle async evaluators + if asyncio.iscoroutine(result): + result = await result + + # Normalize to list + if isinstance(result, dict): + return [result] + + elif isinstance(result, list): + return result + + else: + return [] + + except Exception as e: + evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator") + logging.getLogger("langfuse").error(f"Evaluator {evaluator_name} failed: {e}") + return [] + + +async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any: + """Run a task function and handle sync/async.""" + result = task(item=item) + + # Handle async tasks + if asyncio.iscoroutine(result): + result = await result + + return result + + +def create_evaluator_from_autoevals( + autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]] +) -> EvaluatorFunction: + """Create a Langfuse evaluator from an autoevals evaluator. + + Args: + autoevals_evaluator: An autoevals evaluator instance + **kwargs: Additional arguments passed to the evaluator + + Returns: + A Langfuse-compatible evaluator function + """ + + def langfuse_evaluator( + *, + input: Any, + output: Any, + expected_output: Any, + metadata: Optional[Dict[str, Any]], + **kwargs: Dict[str, Any], + ) -> Evaluation: + evaluation = autoevals_evaluator( + input=input, output=output, expected=expected_output, **kwargs + ) + + return Evaluation( + name=evaluation.name, value=evaluation.score, metadata=evaluation.metadata + ) + + return langfuse_evaluator diff --git a/tests/test_core_sdk.py b/tests/test_core_sdk.py index 9a758e38a..f29851d84 100644 --- a/tests/test_core_sdk.py +++ b/tests/test_core_sdk.py @@ -1934,8 +1934,8 @@ def test_start_as_current_observation_types(): def test_that_generation_like_properties_are_actually_created(): """Test that generation-like observation types properly support generation properties.""" from langfuse._client.constants import ( - get_observation_types_list, ObservationTypeGenerationLike, + get_observation_types_list, ) langfuse = Langfuse() From e2d08ae48e66a7c8d35b19a7c25efca08a1d268f Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:11:06 +0200 Subject: [PATCH 14/25] push --- langfuse/__init__.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/langfuse/__init__.py b/langfuse/__init__.py index 3449e851f..b2cfa96f6 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -6,16 +6,16 @@ from ._client.get_client import get_client from ._client.observe import observe from ._client.span import ( - LangfuseEvent, - LangfuseGeneration, - LangfuseSpan, LangfuseAgent, - LangfuseTool, LangfuseChain, LangfuseEmbedding, LangfuseEvaluator, - LangfuseRetriever, + LangfuseEvent, + LangfuseGeneration, LangfuseGuardrail, + LangfuseRetriever, + LangfuseSpan, + LangfuseTool, ) Langfuse = _client_module.Langfuse @@ -36,4 +36,5 @@ "LangfuseEvaluator", "LangfuseRetriever", "LangfuseGuardrail", + "experiment", ] From 07b17b9074f99f79f39b2611c0704c6ee8917ea6 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:40:37 +0200 Subject: [PATCH 15/25] push --- langfuse/__init__.py | 1 + langfuse/experiment.py | 6 ++++++ 2 files changed, 7 insertions(+) diff 
--git a/langfuse/__init__.py b/langfuse/__init__.py index b2cfa96f6..049d922cd 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -37,4 +37,5 @@ "LangfuseRetriever", "LangfuseGuardrail", "experiment", + "api", ] diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 249c71d2d..5427f06d9 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -19,6 +19,8 @@ Union, ) +from langfuse.api import ScoreDataType + if TYPE_CHECKING: from langfuse._client.datasets import DatasetItemClient @@ -113,6 +115,8 @@ class Evaluation(TypedDict, total=False): metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details. + data_type: Optional score data type; one of NUMERIC,CATEGORICAL, or BOOLEAN; default: NUMERIC + config_id: Optional Langfuse score config id Examples: Quantitative accuracy evaluation: @@ -159,6 +163,8 @@ class Evaluation(TypedDict, total=False): value: Union[int, float, str, bool, None] comment: Optional[str] metadata: Optional[Dict[str, Any]] + data_type: Optional[ScoreDataType] + config_id: Optional[str] class ExperimentItemResult(TypedDict): From b01cbd082a9f28d9d175c0d9b99083c3cde7b4cc Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:46:05 +0200 Subject: [PATCH 16/25] push --- langfuse/_client/client.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 514e00084..b5479b115 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -82,6 +82,7 @@ Prompt_Text, ) from langfuse.experiment import ( + Evaluation, EvaluatorFunction, ExperimentData, ExperimentItem, @@ -2674,7 +2675,7 @@ async def process_item(item: ExperimentItem) -> dict: valid_results.append(result) # type: ignore # Run experiment-level evaluators - run_evaluations = [] + run_evaluations: List[Evaluation] = [] for run_evaluator in run_evaluators: try: evaluations = await _run_evaluator( @@ -2713,10 +2714,11 @@ async def process_item(item: ExperimentItem) -> dict: if dataset_run_id: self.create_score( dataset_run_id=dataset_run_id, - name=evaluation["name"], - value=evaluation["value"], # type: ignore + name=evaluation.get("name") or "", + value=evaluation.get("value"), # type: ignore comment=evaluation.get("comment"), metadata=evaluation.get("metadata"), + data_type=evaluation.get("data_type"), # type: ignore ) except Exception as e: From cbfcdd43078c002268563a662560709b277b189b Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 11:05:37 +0200 Subject: [PATCH 17/25] push --- langfuse/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/langfuse/__init__.py b/langfuse/__init__.py index 049d922cd..23f6ac143 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -38,4 +38,5 @@ "LangfuseGuardrail", "experiment", "api", + "async_api", ] From 009c191c34be2f39b08d8bd8835dfbc0d306896e Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 11:05:56 +0200 Subject: [PATCH 18/25] push --- langfuse/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/langfuse/__init__.py b/langfuse/__init__.py index 23f6ac143..049d922cd 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -38,5 +38,4 @@ "LangfuseGuardrail", "experiment", "api", - "async_api", ] From 
e4a459946dcd581f731f35ec2892bf25d678418a Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 18:25:03 +0200 Subject: [PATCH 19/25] move to classes --- langfuse/_client/client.py | 76 ++--- langfuse/_client/datasets.py | 15 +- langfuse/experiment.py | 629 ++++++++++++++++++++++------------- tests/test_experiments.py | 168 +++++----- 4 files changed, 527 insertions(+), 361 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index b5479b115..5dac439af 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2543,11 +2543,11 @@ def length_evaluator(*, input, output, expected_output=None, **kwargs): evaluators=[length_evaluator] ) - print(f"Processed {len(result['item_results'])} items") - for item_result in result["item_results"]: - print(f"Input: {item_result['item']['input']}") - print(f"Output: {item_result['output']}") - print(f"Evaluations: {item_result['evaluations']}") + print(f"Processed {len(result.item_results)} items") + for item_result in result.item_results: + print(f"Input: {item_result.item['input']}") + print(f"Output: {item_result.output}") + print(f"Evaluations: {item_result.evaluations}") ``` Advanced experiment with async task and multiple evaluators: @@ -2576,9 +2576,9 @@ def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): def average_accuracy(*, item_results, **kwargs): accuracies = [ - eval["value"] for result in item_results - for eval in result["evaluations"] - if eval["name"] == "accuracy" + eval.value for result in item_results + for eval in result.evaluations + if eval.name == "accuracy" ] return { "name": "average_accuracy", @@ -2656,7 +2656,7 @@ async def _run_experiment_async( semaphore = asyncio.Semaphore(max_concurrency) # Process all items - async def process_item(item: ExperimentItem) -> dict: + async def process_item(item: ExperimentItem) -> ExperimentItemResult: async with semaphore: return await self._process_experiment_item( item, task, evaluators, name, description, metadata @@ -2671,7 +2671,7 @@ async def process_item(item: ExperimentItem) -> dict: for i, result in enumerate(item_results): if isinstance(result, Exception): langfuse_logger.error(f"Item {i} failed: {result}") - elif isinstance(result, dict): + elif isinstance(result, ExperimentItemResult): valid_results.append(result) # type: ignore # Run experiment-level evaluators @@ -2686,9 +2686,7 @@ async def process_item(item: ExperimentItem) -> dict: langfuse_logger.error(f"Run evaluator failed: {e}") # Generate dataset run URL if applicable - dataset_run_id = ( - valid_results[0].get("dataset_run_id") if valid_results else None - ) + dataset_run_id = valid_results[0].dataset_run_id if valid_results else None dataset_run_url = None if dataset_run_id and data: try: @@ -2714,11 +2712,11 @@ async def process_item(item: ExperimentItem) -> dict: if dataset_run_id: self.create_score( dataset_run_id=dataset_run_id, - name=evaluation.get("name") or "", - value=evaluation.get("value"), # type: ignore - comment=evaluation.get("comment"), - metadata=evaluation.get("metadata"), - data_type=evaluation.get("data_type"), # type: ignore + name=evaluation.name or "", + value=evaluation.value, # type: ignore + comment=evaluation.comment, + metadata=evaluation.metadata, + data_type=evaluation.data_type, # type: ignore ) except Exception as e: @@ -2727,14 +2725,14 @@ async def process_item(item: ExperimentItem) -> dict: # Flush scores and traces self.flush() - return { - "name": name, - 
"description": description, - "item_results": valid_results, - "run_evaluations": run_evaluations, - "dataset_run_id": dataset_run_id, - "dataset_run_url": dataset_run_url, - } + return ExperimentResult( + name=name, + description=description, + item_results=valid_results, + run_evaluations=run_evaluations, + dataset_run_id=dataset_run_id, + dataset_run_url=dataset_run_url, + ) async def _process_experiment_item( self, @@ -2744,7 +2742,7 @@ async def _process_experiment_item( experiment_name: str, experiment_description: Optional[str], experiment_metadata: Dict[str, Any], - ) -> dict: + ) -> ExperimentItemResult: # Execute task with tracing span_name = "experiment-item-run" @@ -2842,22 +2840,24 @@ async def _process_experiment_item( for evaluation in eval_results: self.create_score( trace_id=trace_id, - name=evaluation.get("name", "unknown"), - value=evaluation.get("value", -1), # type: ignore - comment=evaluation.get("comment"), - metadata=evaluation.get("metadata"), + name=evaluation.name or "unknown", + value=evaluation.value + if evaluation.value is not None + else -1, # type: ignore + comment=evaluation.comment, + metadata=evaluation.metadata, ) except Exception as e: langfuse_logger.error(f"Evaluator failed: {e}") - return { - "item": item, - "output": output, - "evaluations": evaluations, - "trace_id": trace_id, - "dataset_run_id": dataset_run_id, - } + return ExperimentItemResult( + item=item, + output=output, + evaluations=evaluations, + trace_id=trace_id, + dataset_run_id=dataset_run_id, + ) except Exception as e: span.update( diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index f62c8b0f1..29754a8ce 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -237,12 +237,21 @@ def run_experiment( Will be combined with individual item metadata. Returns: - ExperimentResult dictionary containing: + ExperimentResult object containing: + - name: The experiment name + - description: Optional experiment description - item_results: Results for each dataset item with outputs and evaluations - run_evaluations: Aggregate evaluation results for the entire run - dataset_run_id: ID of the created dataset run in Langfuse - dataset_run_url: Direct URL to view the experiment results in Langfuse UI + The result object provides a format() method for human-readable output: + ```python + result = dataset.run_experiment(...) + print(result.format()) # Summary view + print(result.format(include_item_results=True)) # Detailed view + ``` + Raises: ValueError: If the dataset has no items or no Langfuse client is available @@ -372,8 +381,8 @@ def content_diversity(*, item_results, **kwargs): # Both experiments are now visible in Langfuse for easy comparison print("Compare results in Langfuse:") - print(f"GPT-4: {result_gpt4['dataset_run_url']}") - print(f"Custom: {result_custom['dataset_run_url']}") + print(f"GPT-4: {result_gpt4.dataset_run_url}") + print(f"Custom: {result_custom.dataset_run_url}") ``` Note: diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 5427f06d9..74926cd31 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -7,6 +7,7 @@ import asyncio import logging +from dataclasses import dataclass from typing import ( TYPE_CHECKING, Any, @@ -93,12 +94,13 @@ class LocalExperimentItem(TypedDict, total=False): """ -class Evaluation(TypedDict, total=False): - """Structure for evaluation results returned by evaluator functions. 
+@dataclass(frozen=True) +class Evaluation: + """Represents an evaluation result for an experiment item. - This TypedDict defines the standardized format that all evaluator functions - must return. It provides a consistent structure for storing evaluation metrics - and their metadata across different types of evaluators. + This class provides a strongly-typed way to create evaluation results in evaluator functions. + Users should import this class and return instances instead of dictionaries for better + type safety and IDE support. Attributes: name: Unique identifier for the evaluation metric. Should be descriptive @@ -115,67 +117,128 @@ class Evaluation(TypedDict, total=False): metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details. - data_type: Optional score data type; one of NUMERIC,CATEGORICAL, or BOOLEAN; default: NUMERIC + data_type: Optional score data type; one of NUMERIC, CATEGORICAL, or BOOLEAN; default: NUMERIC config_id: Optional Langfuse score config id Examples: - Quantitative accuracy evaluation: + Basic accuracy evaluation: ```python - accuracy_result: Evaluation = { - "name": "accuracy", - "value": 0.85, - "comment": "85% of responses were correct", - "metadata": {"total_items": 100, "correct_items": 85} - } + from langfuse import Evaluation + + def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): + if not expected_output: + return Evaluation(name="accuracy", value=None, comment="No expected output") + + is_correct = output.strip().lower() == expected_output.strip().lower() + return Evaluation( + name="accuracy", + value=1.0 if is_correct else 0.0, + comment="Correct answer" if is_correct else "Incorrect answer" + ) ``` - Qualitative assessment: + Multi-metric evaluator: ```python - sentiment_result: Evaluation = { - "name": "sentiment", - "value": "positive", - "comment": "Response expresses optimistic viewpoint", - "metadata": {"confidence": 0.92, "model": "sentiment-analyzer-v2"} - } + def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): + return [ + Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"), + Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"), + Evaluation( + name="quality", + value=0.85, + comment="High quality response", + metadata={"confidence": 0.92, "model": "gpt-4"} + ) + ] ``` - Binary check: + Categorical evaluation: ```python - safety_result: Evaluation = { - "name": "safety_check", - "value": True, - "comment": "Content passes all safety filters" - } + def sentiment_evaluator(*, input, output, **kwargs): + sentiment = analyze_sentiment(output) # Returns "positive", "negative", or "neutral" + return Evaluation( + name="sentiment", + value=sentiment, + comment=f"Response expresses {sentiment} sentiment", + data_type="CATEGORICAL" + ) ``` - Failed evaluation: + Failed evaluation with error handling: ```python - failed_result: Evaluation = { - "name": "external_api_score", - "value": None, - "comment": "External API unavailable", - "metadata": {"error": "timeout", "retry_count": 3} - } + def external_api_evaluator(*, input, output, **kwargs): + try: + score = external_api.evaluate(output) + return Evaluation(name="external_score", value=score) + except Exception as e: + return Evaluation( + name="external_score", + value=None, + comment=f"API unavailable: {e}", + metadata={"error": str(e), 
"retry_count": 3} + ) ``` + + Note: + This class is immutable (frozen=True) to ensure evaluation results cannot be + accidentally modified after creation. All fields except name and value are optional. """ name: str value: Union[int, float, str, bool, None] - comment: Optional[str] - metadata: Optional[Dict[str, Any]] - data_type: Optional[ScoreDataType] - config_id: Optional[str] + comment: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + data_type: Optional[ScoreDataType] = None + config_id: Optional[str] = None -class ExperimentItemResult(TypedDict): +@dataclass(frozen=True) +class ExperimentItemResult: """Result structure for individual experiment items. - Args: - item: The original experiment item that was processed - output: The actual output produced by the task - evaluations: List of evaluation results for this item - trace_id: Langfuse trace ID for this item's execution - dataset_run_id: Dataset run ID if this item was part of a Langfuse dataset + This dataclass represents the complete result of processing a single item + during an experiment run, including the original input, task output, + evaluations, and tracing information. + + Attributes: + item: The original experiment item that was processed. Can be either + a dictionary with 'input', 'expected_output', and 'metadata' keys, + or a DatasetItemClient from Langfuse datasets. + output: The actual output produced by the task function for this item. + Can be any type depending on what your task function returns. + evaluations: List of evaluation results for this item. Each evaluation + contains a name, value, optional comment, and optional metadata. + trace_id: Optional Langfuse trace ID for this item's execution. Used + to link the experiment result with the detailed trace in Langfuse UI. + dataset_run_id: Optional dataset run ID if this item was part of a + Langfuse dataset. None for local experiments. + + Examples: + Accessing item result data: + ```python + result = langfuse.run_experiment(...) + for item_result in result.item_results: + print(f"Input: {item_result.item}") + print(f"Output: {item_result.output}") + print(f"Trace: {item_result.trace_id}") + + # Access evaluations + for evaluation in item_result.evaluations: + print(f"{evaluation.name}: {evaluation.value}") + ``` + + Working with different item types: + ```python + # Local experiment item (dict) + if isinstance(item_result.item, dict): + input_data = item_result.item["input"] + expected = item_result.item.get("expected_output") + + # Langfuse dataset item (object with attributes) + else: + input_data = item_result.item.input + expected = item_result.item.expected_output + ``` """ item: ExperimentItem @@ -185,22 +248,291 @@ class ExperimentItemResult(TypedDict): dataset_run_id: Optional[str] -class ExperimentResult(TypedDict): +class ExperimentResult: """Complete result structure for experiment execution. - Args: - item_results: Results from processing each individual data item - run_evaluations: Results from run-level evaluators - dataset_run_id: ID of the dataset run (if using Langfuse datasets) - dataset_run_url: URL to view the dataset run in Langfuse UI + This class encapsulates the complete results of running an experiment on a dataset, + including individual item results, aggregate run-level evaluations, and metadata + about the experiment execution. 
+ + Attributes: + name: The name of the experiment as specified during execution + description: Optional description of the experiment's purpose or methodology + item_results: List of results from processing each individual dataset item, + containing the original item, task output, evaluations, and trace information + run_evaluations: List of aggregate evaluation results computed across all items, + such as average scores, statistical summaries, or cross-item analyses + dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets) + dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI + + Examples: + Basic usage with local dataset: + ```python + result = langfuse.run_experiment( + name="Capital Cities Test", + data=local_data, + task=generate_capital, + evaluators=[accuracy_check] + ) + + print(f"Processed {len(result.item_results)} items") + print(result.format()) # Human-readable summary + + # Access individual results + for item_result in result.item_results: + print(f"Input: {item_result.item}") + print(f"Output: {item_result.output}") + print(f"Scores: {item_result.evaluations}") + ``` + + Usage with Langfuse datasets: + ```python + dataset = langfuse.get_dataset("qa-eval-set") + result = dataset.run_experiment( + name="GPT-4 QA Evaluation", + task=answer_question, + evaluators=[relevance_check, accuracy_check] + ) + + # View in Langfuse UI + if result.dataset_run_url: + print(f"View detailed results: {result.dataset_run_url}") + ``` + + Formatted output: + ```python + # Get summary view + summary = result.format() + print(summary) + + # Get detailed view with individual items + detailed = result.format(include_item_results=True) + with open("experiment_report.txt", "w") as f: + f.write(detailed) + ``` """ - name: str - description: Optional[str] - item_results: List[ExperimentItemResult] - run_evaluations: List[Evaluation] - dataset_run_id: Optional[str] - dataset_run_url: Optional[str] + def __init__( + self, + name: str, + description: Optional[str], + item_results: List[ExperimentItemResult], + run_evaluations: List[Evaluation], + dataset_run_id: Optional[str] = None, + dataset_run_url: Optional[str] = None, + ): + """Initialize an ExperimentResult with the provided data. + + Args: + name: The name of the experiment + description: Optional description of the experiment + item_results: List of results from processing individual dataset items + run_evaluations: List of aggregate evaluation results for the entire run + dataset_run_id: Optional ID of the dataset run (for Langfuse datasets) + dataset_run_url: Optional URL to view results in Langfuse UI + """ + self.name = name + self.description = description + self.item_results = item_results + self.run_evaluations = run_evaluations + self.dataset_run_id = dataset_run_id + self.dataset_run_url = dataset_run_url + + def format(self, *, include_item_results: bool = False) -> str: + r"""Format the experiment result for human-readable display. + + Converts the experiment result into a nicely formatted string suitable for + console output, logging, or reporting. The output includes experiment overview, + aggregate statistics, and optionally individual item details. 
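Beyond printing, the formatted report pairs naturally with simple automation. The sketch below is a hypothetical CI gate (the `gate_on_accuracy` name, the "accuracy" metric, and the 0.8 threshold are assumptions) that logs the summary produced by `format()` and fails the job when the averaged item-level score regresses:

```python
import sys

from langfuse.experiment import ExperimentResult


def gate_on_accuracy(result: ExperimentResult, threshold: float = 0.8) -> None:
    """Fail a CI job when the average 'accuracy' score drops below the threshold."""
    scores = [
        evaluation.value
        for item_result in result.item_results
        for evaluation in item_result.evaluations
        if evaluation.name == "accuracy" and isinstance(evaluation.value, (int, float))
    ]
    average = sum(scores) / len(scores) if scores else 0.0

    print(result.format())  # human-readable summary for the CI log

    if average < threshold:
        print(f"Accuracy regression: {average:.3f} < {threshold:.3f}", file=sys.stderr)
        sys.exit(1)
```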
+
+        This method provides a comprehensive view of experiment performance including:
+        - Experiment metadata (name, description, item count)
+        - List of evaluation metrics used across items
+        - Average scores computed across all processed items
+        - Run-level evaluation results (aggregate metrics)
+        - Links to view detailed results in Langfuse UI (when available)
+        - Individual item details (when requested)
+
+        Args:
+            include_item_results: Whether to include detailed results for each individual
+                item in the formatted output. When False (default), only shows aggregate
+                statistics and summary information. When True, includes input/output/scores
+                for every processed item, making the output significantly longer but more
+                detailed for debugging and analysis purposes.
+
+        Returns:
+            A formatted multi-line string containing:
+            - Experiment name and description (if provided)
+            - Total number of items successfully processed
+            - List of all evaluation metrics that were applied
+            - Average scores across all items for each numeric metric
+            - Run-level evaluation results with comments
+            - Dataset run URL for viewing in Langfuse UI (if applicable)
+            - Individual item details including inputs, outputs, and scores (if requested)
+
+        Examples:
+            Basic usage showing aggregate results only:
+            ```python
+            result = langfuse.run_experiment(
+                name="Capital Cities",
+                data=dataset,
+                task=generate_capital,
+                evaluators=[accuracy_evaluator]
+            )
+
+            print(result.format())
+            # Output:
+            # ──────────────────────────────────────────────────
+            # 📊 Capital Cities
+            # 100 items
+            # Evaluations:
+            #   • accuracy
+            # Average Scores:
+            #   • accuracy: 0.850
+            ```
+
+            Detailed output including all individual item results:
+            ```python
+            detailed_report = result.format(include_item_results=True)
+            print(detailed_report)
+            # Output includes each item:
+            # 1. Item 1:
+            #    Input: What is the capital of France?
+            #    Expected: Paris
+            #    Actual: The capital of France is Paris.
+            #    Scores:
+            #      • accuracy: 1.000
+            #        💭 Correct answer found
+            # [... continues for all items ...]
+            ```
+
+            Saving formatted results to file for reporting:
+            ```python
+            with open("experiment_report.txt", "w") as f:
+                f.write(result.format(include_item_results=True))
+
+            # Or create summary report
+            summary = result.format()  # Aggregate view only
+            print(f"Experiment Summary:\n{summary}")
+            ```
+
+            Integration with logging systems:
+            ```python
+            import logging
+            logger = logging.getLogger("experiments")
+
+            # Log summary after experiment
+            logger.info(f"Experiment completed:\n{result.format()}")
+
+            # Log detailed results for failed experiments
+            if any(eval.value < threshold for eval in result.run_evaluations):
+                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
+            ```
+        """
+        if not self.item_results:
+            return "No experiment results to display."
+
+        output = ""
+
+        # Individual results section
+        if include_item_results:
+            for i, result in enumerate(self.item_results):
+                output += f"\n{i + 1}. Item {i + 1}:\n"
+
+                # Extract and display input
+                item_input = None
+                if isinstance(result.item, dict):
+                    item_input = result.item.get("input")
+                elif hasattr(result.item, "input"):
+                    item_input = result.item.input
+
+                if item_input is not None:
+                    output += f"   Input: {_format_value(item_input)}\n"
+
+                # Extract and display expected output
+                expected_output = None
+                if isinstance(result.item, dict):
+                    expected_output = result.item.get("expected_output")
+                elif hasattr(result.item, "expected_output"):
+                    expected_output = result.item.expected_output
+
+                if expected_output is not None:
+                    output += f"   Expected: {_format_value(expected_output)}\n"
+                output += f"   Actual: {_format_value(result.output)}\n"
+
+                # Display evaluation scores
+                if result.evaluations:
+                    output += "   Scores:\n"
+                    for evaluation in result.evaluations:
+                        score = evaluation.value
+                        if isinstance(score, (int, float)):
+                            score = f"{score:.3f}"
+                        output += f"     • {evaluation.name}: {score}"
+                        if evaluation.comment:
+                            output += f"\n       💭 {evaluation.comment}"
+                        output += "\n"
+
+                # Display trace link if available
+                if result.trace_id:
+                    output += f"\n   Trace ID: {result.trace_id}\n"
+        else:
+            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
+            output += "💡 Set include_item_results=True to view them\n"
+
+        # Experiment overview section
+        output += f"\n{'─' * 50}\n"
+        output += f"📊 {self.name}"
+        if self.description:
+            output += f" - {self.description}"
+
+        output += f"\n{len(self.item_results)} items"
+
+        # Collect unique evaluation names across all items
+        evaluation_names = set()
+        for result in self.item_results:
+            for evaluation in result.evaluations:
+                evaluation_names.add(evaluation.name)
+
+        if evaluation_names:
+            output += "\nEvaluations:"
+            for eval_name in evaluation_names:
+                output += f"\n  • {eval_name}"
+            output += "\n"
+
+        # Calculate and display average scores
+        if evaluation_names:
+            output += "\nAverage Scores:"
+            for eval_name in evaluation_names:
+                scores = []
+                for result in self.item_results:
+                    for evaluation in result.evaluations:
+                        if evaluation.name == eval_name and isinstance(
+                            evaluation.value, (int, float)
+                        ):
+                            scores.append(evaluation.value)
+
+                if scores:
+                    avg = sum(scores) / len(scores)
+                    output += f"\n  • {eval_name}: {avg:.3f}"
+            output += "\n"
+
+        # Display run-level evaluations
+        if self.run_evaluations:
+            output += "\nRun Evaluations:"
+            for run_eval in self.run_evaluations:
+                score = run_eval.value
+                if isinstance(score, (int, float)):
+                    score = f"{score:.3f}"
+                output += f"\n  • {run_eval.name}: {score}"
+                if run_eval.comment:
+                    output += f"\n    💭 {run_eval.comment}"
+            output += "\n"
+
+        # Add dataset run URL if available
+        if self.dataset_run_url:
+            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
+
+        return output
 
 
 class TaskFunction(Protocol):
@@ -303,7 +635,7 @@ def __call__(
     ) -> Union[
         Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
     ]:
-        """Evaluate a task output for quality, correctness, or other metrics.
+        r"""Evaluate a task output for quality, correctness, or other metrics.
 
         This method should implement specific evaluation logic such as accuracy
         checking, similarity measurement, toxicity detection, fluency assessment, etc.
@@ -440,7 +772,7 @@ def __call__(
    ) -> Union[
         Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
    ]:
-        """Evaluate the entire experiment run with aggregate metrics.
+        r"""Evaluate the entire experiment run with aggregate metrics.
This method should implement aggregate evaluation logic such as computing averages, calculating distributions, finding correlations, detecting patterns @@ -480,9 +812,9 @@ def average_accuracy(*, item_results, **kwargs): accuracy_values = [] for result in item_results: - for evaluation in result["evaluations"]: - if evaluation["name"] == "accuracy": - accuracy_values.append(evaluation["value"]) + for evaluation in result.evaluations: + if evaluation.name == "accuracy": + accuracy_values.append(evaluation.value) if not accuracy_values: return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"} @@ -504,7 +836,7 @@ def statistical_summary(*, item_results, **kwargs): results = [] # Calculate output length statistics - lengths = [len(str(result["output"])) for result in item_results] + lengths = [len(str(result.output)) for result in item_results] results.extend([ {"name": "avg_output_length", "value": sum(lengths) / len(lengths)}, {"name": "min_output_length", "value": min(lengths)}, @@ -526,7 +858,7 @@ def statistical_summary(*, item_results, **kwargs): ```python async def llm_batch_analysis(*, item_results, **kwargs): # Prepare batch analysis prompt - outputs = [result["output"] for result in item_results] + outputs = [result.output for result in item_results] prompt = f"Analyze these {len(outputs)} outputs for common themes:\n" prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs)) @@ -550,9 +882,9 @@ def performance_distribution(*, item_results, **kwargs): score_by_metric = {} for result in item_results: - for evaluation in result["evaluations"]: - metric_name = evaluation["name"] - value = evaluation["value"] + for evaluation in result.evaluations: + metric_name = evaluation.name + value = evaluation.value if isinstance(value, (int, float)): all_scores.append(value) @@ -586,167 +918,6 @@ def performance_distribution(*, item_results, **kwargs): ... -def format_experiment_result( - experiment_result: ExperimentResult, - *, - include_item_results: bool = False, -) -> str: - """Format an experiment result for human-readable display. - - Takes an ExperimentResult object and converts it into a nicely formatted - string suitable for console output or logging. The output includes experiment - overview, aggregate statistics, and optionally individual item details. - - Args: - experiment_result: Complete experiment result containing name, description, - item results, run evaluations, and dataset run information. - include_item_results: Whether to include detailed results for each individual - item in the output. When False (default), only shows aggregate statistics. - Set to True to see input/output/scores for every processed item. - - Returns: - A formatted multi-line string containing: - - Experiment name and description - - Number of items processed - - List of evaluation metrics used - - Average scores across all items - - Run-level evaluation results - - Dataset run URL (if available) - - Individual item details (if include_item_results=True) - - Examples: - Basic usage with aggregate results only: - ```python - result = langfuse.run_experiment(...) - print(format_experiment_result(result)) - ``` - - Detailed output including individual items: - ```python - result = langfuse.run_experiment(...) - detailed_report = format_experiment_result( - result, - include_item_results=True - ) - print(detailed_report) - ``` - - Save formatted results to file: - ```python - result = dataset.run_experiment(...) 
- with open("experiment_report.txt", "w") as f: - f.write(format_experiment_result(result, include_item_results=True)) - ``` - """ - item_results = experiment_result["item_results"] - run_evaluations = experiment_result["run_evaluations"] - dataset_run_url = experiment_result["dataset_run_url"] - - if not item_results: - return "No experiment results to display." - - output = "" - - # Individual results - if include_item_results: - for i, result in enumerate(item_results): - output += f"\n{i + 1}. Item {i + 1}:\n" - - # Input, expected, and actual - item_input = None - if isinstance(result["item"], dict): - item_input = result["item"].get("input") - elif hasattr(result["item"], "input"): - item_input = result["item"].input - - if item_input is not None: - output += f" Input: {_format_value(item_input)}\n" - - expected_output = None - if isinstance(result["item"], dict): - expected_output = result["item"].get("expected_output") - elif hasattr(result["item"], "expected_output"): - expected_output = result["item"].expected_output - - if expected_output is not None: - output += f" Expected: {_format_value(expected_output)}\n" - output += f" Actual: {_format_value(result['output'])}\n" - - # Scores - if result["evaluations"]: - output += " Scores:\n" - for evaluation in result["evaluations"]: - score = evaluation["value"] - if isinstance(score, (int, float)): - score = f"{score:.3f}" - output += f" โ€ข {evaluation['name']}: {score}" - if evaluation.get("comment"): - output += f"\n ๐Ÿ’ญ {evaluation['comment']}" - output += "\n" - - # Trace link - if result.get("trace_id"): - # Note: We'd need the langfuse client to generate the actual URL - output += f"\n Trace ID: {result['trace_id']}\n" - else: - output += f"Individual Results: Hidden ({len(item_results)} items)\n" - output += "๐Ÿ’ก Set include_item_results=True to view them\n" - - # Experiment Overview - output += f"\n{'โ”€' * 50}\n" - output += f"๐Ÿ“Š {experiment_result['name']}" - if experiment_result["description"]: - output += f" - {experiment_result['description']}" - - output += f"\n{len(item_results)} items" - - # Get unique evaluation names - evaluation_names = set() - for result in item_results: - for evaluation in result["evaluations"]: - evaluation_names.add(evaluation["name"]) - - if evaluation_names: - output += "\nEvaluations:" - for eval_name in evaluation_names: - output += f"\n โ€ข {eval_name}" - output += "\n" - - # Average scores - if evaluation_names: - output += "\nAverage Scores:" - for eval_name in evaluation_names: - scores = [] - for result in item_results: - for evaluation in result["evaluations"]: - if evaluation["name"] == eval_name and isinstance( - evaluation["value"], (int, float) - ): - scores.append(evaluation["value"]) - - if scores: - avg = sum(scores) / len(scores) - output += f"\n โ€ข {eval_name}: {avg:.3f}" - output += "\n" - - # Run evaluations - if run_evaluations: - output += "\nRun Evaluations:" - for run_eval in run_evaluations: - score = run_eval["value"] - if isinstance(score, (int, float)): - score = f"{score:.3f}" - output += f"\n โ€ข {run_eval['name']}: {score}" - if run_eval.get("comment"): - output += f"\n ๐Ÿ’ญ {run_eval['comment']}" - output += "\n" - - if dataset_run_url: - output += f"\n๐Ÿ”— Dataset Run:\n {dataset_run_url}" - - return output - - def _format_value(value: Any) -> str: """Format a value for display.""" if isinstance(value, str): @@ -766,7 +937,7 @@ async def _run_evaluator( result = await result # Normalize to list - if isinstance(result, dict): + if isinstance(result, (dict, 
Evaluation)): return [result] elif isinstance(result, list): @@ -811,7 +982,7 @@ def langfuse_evaluator( output: Any, expected_output: Any, metadata: Optional[Dict[str, Any]], - **kwargs: Dict[str, Any], + **langfuse_kwargs: Dict[str, Any], ) -> Evaluation: evaluation = autoevals_evaluator( input=input, output=output, expected=expected_output, **kwargs diff --git a/tests/test_experiments.py b/tests/test_experiments.py index c278243ab..d6ec67369 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -6,7 +6,7 @@ import pytest from langfuse import get_client -from langfuse._client.experiments import ( +from langfuse.experiment import ( Evaluation, ExperimentData, ExperimentItem, @@ -37,29 +37,25 @@ def mock_task(*, item: ExperimentItem, **kwargs: Dict[str, Any]): def simple_evaluator(*, input, output, expected_output=None, **kwargs): """Return output length.""" - return Evaluation(**{"name": "length_check", "value": len(output)}) + return Evaluation(name="length_check", value=len(output)) def factuality_evaluator(*, input, output, expected_output=None, **kwargs): """Mock factuality evaluator.""" # Simple mock: check if expected output is in the output if expected_output and expected_output.lower() in output.lower(): - return Evaluation( - **{"name": "factuality", "value": 1.0, "comment": "Correct answer found"} - ) - return Evaluation( - **{"name": "factuality", "value": 0.0, "comment": "Incorrect answer"} - ) + return Evaluation(name="factuality", value=1.0, comment="Correct answer found") + return Evaluation(name="factuality", value=0.0, comment="Incorrect answer") def run_evaluator_average_length(*, item_results: List[ExperimentItemResult], **kwargs): """Run evaluator that calculates average output length.""" if not item_results: - return Evaluation(**{"name": "average_length", "value": 0}) + return Evaluation(name="average_length", value=0) - avg_length = sum(len(r["output"]) for r in item_results) / len(item_results) + avg_length = sum(len(r.output) for r in item_results) / len(item_results) - return Evaluation(**{"name": "average_length", "value": avg_length}) + return Evaluation(name="average_length", value=avg_length) # Basic Functionality Tests @@ -77,20 +73,20 @@ def test_run_experiment_on_local_dataset(sample_dataset): ) # Validate basic result structure - assert len(result["item_results"]) == 3 - assert len(result["run_evaluations"]) == 1 - assert result["run_evaluations"][0]["name"] == "average_length" - assert result["dataset_run_id"] is None # No dataset_run_id for local datasets + assert len(result.item_results) == 3 + assert len(result.run_evaluations) == 1 + assert result.run_evaluations[0].name == "average_length" + assert result.dataset_run_id is None # No dataset_run_id for local datasets # Validate item results structure - for item_result in result["item_results"]: - assert "output" in item_result - assert "evaluations" in item_result - assert "trace_id" in item_result + for item_result in result.item_results: + assert hasattr(item_result, "output") + assert hasattr(item_result, "evaluations") + assert hasattr(item_result, "trace_id") assert ( - item_result["dataset_run_id"] is None + item_result.dataset_run_id is None ) # No dataset_run_id for local datasets - assert len(item_result["evaluations"]) == 2 # Both evaluators should run + assert len(item_result.evaluations) == 2 # Both evaluators should run # Flush and wait for server processing langfuse_client.flush() @@ -101,8 +97,8 @@ def test_run_experiment_on_local_dataset(sample_dataset): 
expected_inputs = ["Germany", "France", "Spain"] expected_outputs = ["Capital of Germany", "Capital of France", "Capital of Spain"] - for i, item_result in enumerate(result["item_results"]): - trace_id = item_result["trace_id"] + for i, item_result in enumerate(result.item_results): + trace_id = item_result.trace_id assert trace_id is not None, f"Item {i} should have a trace_id" # Fetch trace from API @@ -173,9 +169,9 @@ def test_run_experiment_on_langfuse_dataset(): ) # Should have dataset run ID for Langfuse datasets - assert result["dataset_run_id"] is not None - assert len(result["item_results"]) == 2 - assert all(item["dataset_run_id"] is not None for item in result["item_results"]) + assert result.dataset_run_id is not None + assert len(result.item_results) == 2 + assert all(item.dataset_run_id is not None for item in result.item_results) # Flush and wait for server processing langfuse_client.flush() @@ -188,13 +184,13 @@ def test_run_experiment_on_langfuse_dataset(): # Validate traces are correctly persisted with input/output/metadata expected_data = {"Germany": "Capital of Germany", "France": "Capital of France"} - dataset_run_id = result["dataset_run_id"] + dataset_run_id = result.dataset_run_id # Create a mapping from dataset item ID to dataset item for validation dataset_item_map = {item.id: item for item in dataset.items} - for i, item_result in enumerate(result["item_results"]): - trace_id = item_result["trace_id"] + for i, item_result in enumerate(result.item_results): + trace_id = item_result.trace_id assert trace_id is not None, f"Item {i} should have a trace_id" # Fetch trace from API @@ -283,7 +279,7 @@ def test_run_experiment_on_langfuse_dataset(): run_item_trace_ids = { item.trace_id for item in dataset_run_items.data if item.trace_id } - result_trace_ids = {item["trace_id"] for item in result["item_results"]} + result_trace_ids = {item.trace_id for item in result.item_results} assert run_item_trace_ids == result_trace_ids, ( f"Dataset run items should link to the same traces as experiment results. 
" @@ -300,7 +296,7 @@ def failing_evaluator(**kwargs): raise Exception("Evaluator failed") def working_evaluator(**kwargs): - return Evaluation(**{"name": "working_eval", "value": 1.0}) + return Evaluation(name="working_eval", value=1.0) result = langfuse_client.run_experiment( name="Error test", @@ -310,14 +306,14 @@ def working_evaluator(**kwargs): ) # Should complete with only working evaluator - assert len(result["item_results"]) == 1 + assert len(result.item_results) == 1 # Only the working evaluator should have produced results assert ( len( [ eval - for eval in result["item_results"][0]["evaluations"] - if eval["name"] == "working_eval" + for eval in result.item_results[0].evaluations + if eval.name == "working_eval" ] ) == 1 @@ -345,7 +341,7 @@ def working_task(item): ) # Should complete but with no valid results since all tasks failed - assert len(result["item_results"]) == 0 + assert len(result.item_results) == 0 langfuse_client.flush() time.sleep(1) @@ -366,8 +362,8 @@ def failing_run_evaluator(**kwargs): ) # Should complete but run evaluations should be empty - assert len(result["item_results"]) == 1 - assert len(result["run_evaluations"]) == 0 + assert len(result.item_results) == 1 + assert len(result.run_evaluations) == 0 langfuse_client.flush() time.sleep(1) @@ -385,8 +381,8 @@ def test_empty_dataset_handling(): run_evaluators=[run_evaluator_average_length], ) - assert len(result["item_results"]) == 0 - assert len(result["run_evaluations"]) == 1 # Run evaluators still execute + assert len(result.item_results) == 0 + assert len(result.run_evaluations) == 1 # Run evaluators still execute langfuse_client.flush() time.sleep(1) @@ -409,10 +405,10 @@ def test_dataset_with_missing_fields(): ) # Should handle missing fields gracefully - assert len(result["item_results"]) == 3 - for item_result in result["item_results"]: - assert "trace_id" in item_result - assert "output" in item_result + assert len(result.item_results) == 3 + for item_result in result.item_results: + assert hasattr(item_result, "trace_id") + assert hasattr(item_result, "output") langfuse_client.flush() time.sleep(1) @@ -430,14 +426,14 @@ def test_large_dataset_with_concurrency(): name="Large dataset test", data=large_dataset, task=lambda **kwargs: f"Processed {kwargs['item']}", - evaluators=[lambda **kwargs: {"name": "simple_eval", "value": 1.0}], + evaluators=[lambda **kwargs: Evaluation(name="simple_eval", value=1.0)], max_concurrency=5, ) - assert len(result["item_results"]) == 20 - for item_result in result["item_results"]: - assert len(item_result["evaluations"]) == 1 - assert "trace_id" in item_result + assert len(result.item_results) == 20 + for item_result in result.item_results: + assert len(item_result.evaluations) == 1 + assert hasattr(item_result, "trace_id") langfuse_client.flush() time.sleep(3) @@ -449,9 +445,7 @@ def test_single_evaluation_return(): langfuse_client = get_client() def single_evaluator(**kwargs): - return Evaluation( - **{"name": "single_eval", "value": 1, "comment": "Single evaluation"} - ) + return Evaluation(name="single_eval", value=1, comment="Single evaluation") result = langfuse_client.run_experiment( name="Single evaluation test", @@ -460,9 +454,9 @@ def single_evaluator(**kwargs): evaluators=[single_evaluator], ) - assert len(result["item_results"]) == 1 - assert len(result["item_results"][0]["evaluations"]) == 1 - assert result["item_results"][0]["evaluations"][0]["name"] == "single_eval" + assert len(result.item_results) == 1 + assert len(result.item_results[0].evaluations) 
== 1 + assert result.item_results[0].evaluations[0].name == "single_eval" langfuse_client.flush() time.sleep(1) @@ -478,9 +472,9 @@ def test_no_evaluators(): task=lambda **kwargs: "result", ) - assert len(result["item_results"]) == 1 - assert len(result["item_results"][0]["evaluations"]) == 0 - assert len(result["run_evaluations"]) == 0 + assert len(result.item_results) == 1 + assert len(result.item_results[0].evaluations) == 0 + assert len(result.run_evaluations) == 0 langfuse_client.flush() time.sleep(1) @@ -492,11 +486,7 @@ def test_only_run_evaluators(): def run_only_evaluator(**kwargs): return Evaluation( - **{ - "name": "run_only_eval", - "value": 10, - "comment": "Run-level evaluation", - } + name="run_only_eval", value=10, comment="Run-level evaluation" ) result = langfuse_client.run_experiment( @@ -506,10 +496,10 @@ def run_only_evaluator(**kwargs): run_evaluators=[run_only_evaluator], ) - assert len(result["item_results"]) == 1 - assert len(result["item_results"][0]["evaluations"]) == 0 # No item evaluations - assert len(result["run_evaluations"]) == 1 - assert result["run_evaluations"][0]["name"] == "run_only_eval" + assert len(result.item_results) == 1 + assert len(result.item_results[0].evaluations) == 0 # No item evaluations + assert len(result.run_evaluations) == 1 + assert result.run_evaluations[0].name == "run_only_eval" langfuse_client.flush() time.sleep(1) @@ -520,13 +510,13 @@ def test_different_data_types(): langfuse_client = get_client() def number_evaluator(**kwargs): - return Evaluation(**{"name": "number_eval", "value": 42}) + return Evaluation(name="number_eval", value=42) def string_evaluator(**kwargs): - return Evaluation(**{"name": "string_eval", "value": "excellent"}) + return Evaluation(name="string_eval", value="excellent") def boolean_evaluator(**kwargs): - return Evaluation(**{"name": "boolean_eval", "value": True}) + return Evaluation(name="boolean_eval", value=True) result = langfuse_client.run_experiment( name="Different data types test", @@ -535,10 +525,10 @@ def boolean_evaluator(**kwargs): evaluators=[number_evaluator, string_evaluator, boolean_evaluator], ) - evaluations = result["item_results"][0]["evaluations"] + evaluations = result.item_results[0].evaluations assert len(evaluations) == 3 - eval_by_name = {e["name"]: e["value"] for e in evaluations} + eval_by_name = {e.name: e.value for e in evaluations} assert eval_by_name["number_eval"] == 42 assert eval_by_name["string_eval"] == "excellent" assert eval_by_name["boolean_eval"] is True @@ -566,20 +556,16 @@ def test_scores_are_persisted(): def test_evaluator(**kwargs): return Evaluation( - **{ - "name": "persistence_test", - "value": 0.85, - "comment": "Test evaluation for persistence", - } + name="persistence_test", + value=0.85, + comment="Test evaluation for persistence", ) def test_run_evaluator(**kwargs): return Evaluation( - **{ - "name": "persistence_run_test", - "value": 0.9, - "comment": "Test run evaluation for persistence", - } + name="persistence_run_test", + value=0.9, + comment="Test run evaluation for persistence", ) result = dataset.run_experiment( @@ -590,9 +576,9 @@ def test_run_evaluator(**kwargs): run_evaluators=[test_run_evaluator], ) - assert result["dataset_run_id"] is not None - assert len(result["item_results"]) == 1 - assert len(result["run_evaluations"]) == 1 + assert result.dataset_run_id is not None + assert len(result.item_results) == 1 + assert len(result.run_evaluations) == 1 langfuse_client.flush() time.sleep(3) @@ -650,9 +636,9 @@ def 
test_multiple_experiments_on_same_dataset(): time.sleep(2) # Both experiments should have different run IDs - assert result1["dataset_run_id"] is not None - assert result2["dataset_run_id"] is not None - assert result1["dataset_run_id"] != result2["dataset_run_id"] + assert result1.dataset_run_id is not None + assert result2.dataset_run_id is not None + assert result1.dataset_run_id != result2.dataset_run_id # Verify both runs exist in database api = get_api() @@ -679,10 +665,10 @@ def test_format_experiment_results_basic(): ) # Basic validation that result structure is correct for formatting - assert len(result["item_results"]) == 1 - assert len(result["run_evaluations"]) == 1 - assert "trace_id" in result["item_results"][0] - assert "evaluations" in result["item_results"][0] + assert len(result.item_results) == 1 + assert len(result.run_evaluations) == 1 + assert hasattr(result.item_results[0], "trace_id") + assert hasattr(result.item_results[0], "evaluations") langfuse_client.flush() time.sleep(1) From fbe54976dc2d9a850adf2ff5a81a4e72617d7ef4 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 18:58:16 +0200 Subject: [PATCH 20/25] move to classes --- langfuse/__init__.py | 3 ++ langfuse/_client/client.py | 6 +-- langfuse/experiment.py | 91 ++++++++++++++++++++++++++++---------- 3 files changed, 73 insertions(+), 27 deletions(-) diff --git a/langfuse/__init__.py b/langfuse/__init__.py index 049d922cd..b2b73b54b 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -1,5 +1,7 @@ """.. include:: ../README.md""" +from langfuse.experiment import Evaluation + from ._client import client as _client_module from ._client.attributes import LangfuseOtelSpanAttributes from ._client.constants import ObservationTypeLiteral @@ -36,6 +38,7 @@ "LangfuseEvaluator", "LangfuseRetriever", "LangfuseGuardrail", + "Evaluation", "experiment", "api", ] diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 5dac439af..3c7558465 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2840,10 +2840,8 @@ async def _process_experiment_item( for evaluation in eval_results: self.create_score( trace_id=trace_id, - name=evaluation.name or "unknown", - value=evaluation.value - if evaluation.value is not None - else -1, # type: ignore + name=evaluation.name, + value=evaluation.value or -1, comment=evaluation.comment, metadata=evaluation.metadata, ) diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 74926cd31..62a15ac23 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -7,7 +7,6 @@ import asyncio import logging -from dataclasses import dataclass from typing import ( TYPE_CHECKING, Any, @@ -94,13 +93,11 @@ class LocalExperimentItem(TypedDict, total=False): """ -@dataclass(frozen=True) class Evaluation: - """Represents an evaluation result for an experiment item. + """Represents an evaluation result for an experiment item or an entire experiment run. This class provides a strongly-typed way to create evaluation results in evaluator functions. - Users should import this class and return instances instead of dictionaries for better - type safety and IDE support. + Users must use keyword arguments when instantiating this class. Attributes: name: Unique identifier for the evaluation metric. Should be descriptive @@ -117,7 +114,7 @@ class Evaluation: metadata: Optional structured metadata about the evaluation process. 
Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details. - data_type: Optional score data type; one of NUMERIC, CATEGORICAL, or BOOLEAN; default: NUMERIC + data_type: Optional score data type, required if value is not NUMERIC; one of NUMERIC, CATEGORICAL, or BOOLEAN; default: NUMERIC config_id: Optional Langfuse score config id Examples: @@ -180,25 +177,47 @@ def external_api_evaluator(*, input, output, **kwargs): ``` Note: - This class is immutable (frozen=True) to ensure evaluation results cannot be - accidentally modified after creation. All fields except name and value are optional. + All arguments must be passed as keywords. Positional arguments are not allowed + to ensure code clarity and prevent errors from argument reordering. """ - name: str - value: Union[int, float, str, bool, None] - comment: Optional[str] = None - metadata: Optional[Dict[str, Any]] = None - data_type: Optional[ScoreDataType] = None - config_id: Optional[str] = None + def __init__( + self, + *, + name: str, + value: Union[int, float, str, bool, None], + comment: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + data_type: Optional[ScoreDataType] = None, + config_id: Optional[str] = None, + ): + """Initialize an Evaluation with the provided data. + + Args: + name: Unique identifier for the evaluation metric + value: The evaluation score or result + comment: Optional human-readable explanation of the result + metadata: Optional structured metadata about the evaluation process + data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN) + config_id: Optional Langfuse score config id + + Note: + All arguments must be provided as keywords. Positional arguments will raise a TypeError. + """ + self.name = name + self.value = value + self.comment = comment + self.metadata = metadata + self.data_type = data_type + self.config_id = config_id -@dataclass(frozen=True) class ExperimentItemResult: """Result structure for individual experiment items. - This dataclass represents the complete result of processing a single item + This class represents the complete result of processing a single item during an experiment run, including the original input, task output, - evaluations, and tracing information. + evaluations, and tracing information. Users must use keyword arguments when instantiating this class. Attributes: item: The original experiment item that was processed. Can be either @@ -239,13 +258,38 @@ class ExperimentItemResult: input_data = item_result.item.input expected = item_result.item.expected_output ``` + + Note: + All arguments must be passed as keywords. Positional arguments are not allowed + to ensure code clarity and prevent errors from argument reordering. """ - item: ExperimentItem - output: Any - evaluations: List[Evaluation] - trace_id: Optional[str] - dataset_run_id: Optional[str] + def __init__( + self, + *, + item: ExperimentItem, + output: Any, + evaluations: List[Evaluation], + trace_id: Optional[str], + dataset_run_id: Optional[str], + ): + """Initialize an ExperimentItemResult with the provided data. + + Args: + item: The original experiment item that was processed + output: The actual output produced by the task function for this item + evaluations: List of evaluation results for this item + trace_id: Optional Langfuse trace ID for this item's execution + dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset + + Note: + All arguments must be provided as keywords. 
Positional arguments will raise a TypeError. + """ + self.item = item + self.output = output + self.evaluations = evaluations + self.trace_id = trace_id + self.dataset_run_id = dataset_run_id class ExperimentResult: @@ -314,6 +358,7 @@ class ExperimentResult: def __init__( self, + *, name: str, description: Optional[str], item_results: List[ExperimentItemResult], @@ -938,7 +983,7 @@ async def _run_evaluator( # Normalize to list if isinstance(result, (dict, Evaluation)): - return [result] + return [result] # type: ignore elif isinstance(result, list): return result From 36ca2c20015974deeb2597928aaacd0a2bad1b04 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 19:00:36 +0200 Subject: [PATCH 21/25] add comment metadata --- langfuse/_client/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 3c7558465..ccafd9bd2 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2507,6 +2507,7 @@ def run_experiment( API rate limits and system resources. metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. + If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. Returns: ExperimentResult dictionary containing: From 32cbe0255e8c9698f280bb92708a71147581d93c Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 23:05:13 +0200 Subject: [PATCH 22/25] add run_name --- langfuse/_client/client.py | 31 ++++++++++++++++--- langfuse/_client/datasets.py | 23 +++++++++----- langfuse/_client/span.py | 26 ++++++++-------- langfuse/experiment.py | 58 ++++++++++++++++++++---------------- tests/test_experiments.py | 28 ++++++++--------- 5 files changed, 99 insertions(+), 67 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 9e324b0a3..86085ebbc 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2463,6 +2463,7 @@ def run_experiment( self, *, name: str, + run_name: Optional[str] = None, description: Optional[str] = None, data: ExperimentData, task: TaskFunction, @@ -2487,7 +2488,10 @@ def run_experiment( Args: name: Human-readable name for the experiment. Used for identification - in the Langfuse UI and for dataset run naming if using Langfuse datasets. + in the Langfuse UI. + run_name: Optional exact name for the experiment run. If provided, this will be + used as the exact dataset run name if the `data` contains Langfuse dataset items. + If not provided, this will default to the experiment name appended with an ISO timestamp. description: Optional description explaining the experiment's purpose, methodology, or expected outcomes. data: Array of data items to process. 
Can be either: @@ -2628,6 +2632,9 @@ def average_accuracy(*, item_results, **kwargs): run_async_safely( self._run_experiment_async( name=name, + run_name=self._create_experiment_run_name( + name=name, run_name=run_name + ), description=description, data=data, task=task, @@ -2643,6 +2650,7 @@ async def _run_experiment_async( self, *, name: str, + run_name: str, description: Optional[str], data: ExperimentData, task: TaskFunction, @@ -2651,7 +2659,9 @@ async def _run_experiment_async( max_concurrency: int, metadata: Dict[str, Any], ) -> ExperimentResult: - langfuse_logger.debug(f"Starting experiment '{name}' with {len(data)} items") + langfuse_logger.debug( + f"Starting experiment '{name}' run '{run_name}' with {len(data)} items" + ) # Set up concurrency control semaphore = asyncio.Semaphore(max_concurrency) @@ -2660,7 +2670,7 @@ async def _run_experiment_async( async def process_item(item: ExperimentItem) -> ExperimentItemResult: async with semaphore: return await self._process_experiment_item( - item, task, evaluators, name, description, metadata + item, task, evaluators, name, run_name, description, metadata ) # Run all items concurrently @@ -2728,6 +2738,7 @@ async def process_item(item: ExperimentItem) -> ExperimentItemResult: return ExperimentResult( name=name, + run_name=run_name, description=description, item_results=valid_results, run_evaluations=run_evaluations, @@ -2741,6 +2752,7 @@ async def _process_experiment_item( task: Callable, evaluators: List[Callable], experiment_name: str, + experiment_run_name: str, experiment_description: Optional[str], experiment_metadata: Dict[str, Any], ) -> ExperimentItemResult: @@ -2764,6 +2776,7 @@ async def _process_experiment_item( final_metadata = { "experiment_name": experiment_name, + "experiment_run_name": experiment_run_name, **experiment_metadata, } @@ -2796,7 +2809,7 @@ async def _process_experiment_item( dataset_run_item = self.api.dataset_run_items.create( request=CreateDatasetRunItemRequest( - runName=experiment_name, + runName=experiment_run_name, runDescription=experiment_description, metadata=experiment_metadata, datasetItemId=item.id, # type: ignore @@ -2864,6 +2877,16 @@ async def _process_experiment_item( ) raise e + def _create_experiment_run_name( + self, *, name: Optional[str] = None, run_name: Optional[str] = None + ) -> str: + if run_name: + return run_name + + iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z") + + return f"{name} - {iso_timestamp}" + def auth_check(self) -> bool: """Check if the provided credentials (public and secret key) are valid. diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py index 29754a8ce..beb1248ba 100644 --- a/langfuse/_client/datasets.py +++ b/langfuse/_client/datasets.py @@ -6,6 +6,7 @@ from langfuse.experiment import ( EvaluatorFunction, + ExperimentResult, RunEvaluatorFunction, TaskFunction, ) @@ -199,13 +200,14 @@ def run_experiment( self, *, name: str, + run_name: Optional[str] = None, description: Optional[str] = None, task: TaskFunction, evaluators: List[EvaluatorFunction] = [], run_evaluators: List[RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, Any]] = None, - ) -> Any: + ) -> ExperimentResult: """Run an experiment on this Langfuse dataset with automatic tracking. This is a convenience method that runs an experiment using all items in this @@ -222,6 +224,9 @@ def run_experiment( Args: name: Human-readable name for the experiment run. 
This will be used as the dataset run name in Langfuse for tracking and identification. + run_name: Optional exact name for the dataset run. If provided, this will be + used as the exact dataset run name in Langfuse. If not provided, this will + default to the experiment name appended with an ISO timestamp. description: Optional description of the experiment's purpose, methodology, or what you're testing. Appears in the Langfuse UI for context. task: Function that processes each dataset item and returns output. @@ -238,12 +243,13 @@ def run_experiment( Returns: ExperimentResult object containing: - - name: The experiment name - - description: Optional experiment description - - item_results: Results for each dataset item with outputs and evaluations - - run_evaluations: Aggregate evaluation results for the entire run - - dataset_run_id: ID of the created dataset run in Langfuse - - dataset_run_url: Direct URL to view the experiment results in Langfuse UI + - name: The experiment name. + - run_name: The experiment run name (equivalent to the dataset run name). + - description: Optional experiment description. + - item_results: Results for each dataset item with outputs and evaluations. + - run_evaluations: Aggregate evaluation results for the entire run. + - dataset_run_id: ID of the created dataset run in Langfuse. + - dataset_run_url: Direct URL to view the experiment results in Langfuse UI. The result object provides a format() method for human-readable output: ```python @@ -253,7 +259,7 @@ def run_experiment( ``` Raises: - ValueError: If the dataset has no items or no Langfuse client is available + ValueError: If the dataset has no items or no Langfuse client is available. Examples: Basic dataset experiment: @@ -400,6 +406,7 @@ def content_diversity(*, item_results, **kwargs): return langfuse_client.run_experiment( name=name, + run_name=run_name, description=description, data=self.items, task=task, diff --git a/langfuse/_client/span.py b/langfuse/_client/span.py index 68c1e8c63..9fa9c7489 100644 --- a/langfuse/_client/span.py +++ b/langfuse/_client/span.py @@ -1468,19 +1468,19 @@ def start_as_current_generation( return self.start_as_current_observation( name=name, as_type="generation", - input=input, - output=output, - metadata=metadata, - version=version, - level=level, - status_message=status_message, - completion_start_time=completion_start_time, - model=model, - model_parameters=model_parameters, - usage_details=usage_details, - cost_details=cost_details, - prompt=prompt, - ) + input=input, + output=output, + metadata=metadata, + version=version, + level=level, + status_message=status_message, + completion_start_time=completion_start_time, + model=model, + model_parameters=model_parameters, + usage_details=usage_details, + cost_details=cost_details, + prompt=prompt, + ) def create_event( self, diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 62a15ac23..f4c913c37 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -114,8 +114,9 @@ class Evaluation: metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details. - data_type: Optional score data type, required if value is not NUMERIC; one of NUMERIC, CATEGORICAL, or BOOLEAN; default: NUMERIC - config_id: Optional Langfuse score config id + data_type: Optional score data type. Required if value is not NUMERIC. + One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC. 
+ config_id: Optional Langfuse score config ID. Examples: Basic accuracy evaluation: @@ -194,12 +195,12 @@ def __init__( """Initialize an Evaluation with the provided data. Args: - name: Unique identifier for the evaluation metric - value: The evaluation score or result - comment: Optional human-readable explanation of the result - metadata: Optional structured metadata about the evaluation process - data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN) - config_id: Optional Langfuse score config id + name: Unique identifier for the evaluation metric. + value: The evaluation score or result. + comment: Optional human-readable explanation of the result. + metadata: Optional structured metadata about the evaluation process. + data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN). + config_id: Optional Langfuse score config ID. Note: All arguments must be provided as keywords. Positional arguments will raise a TypeError. @@ -276,11 +277,11 @@ def __init__( """Initialize an ExperimentItemResult with the provided data. Args: - item: The original experiment item that was processed - output: The actual output produced by the task function for this item - evaluations: List of evaluation results for this item - trace_id: Optional Langfuse trace ID for this item's execution - dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset + item: The original experiment item that was processed. + output: The actual output produced by the task function for this item. + evaluations: List of evaluation results for this item. + trace_id: Optional Langfuse trace ID for this item's execution. + dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset. Note: All arguments must be provided as keywords. Positional arguments will raise a TypeError. @@ -300,14 +301,15 @@ class ExperimentResult: about the experiment execution. Attributes: - name: The name of the experiment as specified during execution - description: Optional description of the experiment's purpose or methodology + name: The name of the experiment as specified during execution. + run_name: The name of the current experiment run. + description: Optional description of the experiment's purpose or methodology. item_results: List of results from processing each individual dataset item, - containing the original item, task output, evaluations, and trace information + containing the original item, task output, evaluations, and trace information. run_evaluations: List of aggregate evaluation results computed across all items, - such as average scores, statistical summaries, or cross-item analyses - dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets) - dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI + such as average scores, statistical summaries, or cross-item analyses. + dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets). + dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI. Examples: Basic usage with local dataset: @@ -360,6 +362,7 @@ def __init__( self, *, name: str, + run_name: str, description: Optional[str], item_results: List[ExperimentItemResult], run_evaluations: List[Evaluation], @@ -369,14 +372,16 @@ def __init__( """Initialize an ExperimentResult with the provided data. 
Args: - name: The name of the experiment - description: Optional description of the experiment - item_results: List of results from processing individual dataset items - run_evaluations: List of aggregate evaluation results for the entire run - dataset_run_id: Optional ID of the dataset run (for Langfuse datasets) - dataset_run_url: Optional URL to view results in Langfuse UI + name: The name of the experiment. + run_name: The current experiment run name. + description: Optional description of the experiment. + item_results: List of results from processing individual dataset items. + run_evaluations: List of aggregate evaluation results for the entire run. + dataset_run_id: Optional ID of the dataset run (for Langfuse datasets). + dataset_run_url: Optional URL to view results in Langfuse UI. """ self.name = name + self.run_name = run_name self.description = description self.item_results = item_results self.run_evaluations = run_evaluations @@ -526,7 +531,8 @@ def format(self, *, include_item_results: bool = False) -> str: # Experiment overview section output += f"\\n{'โ”€' * 50}\\n" - output += f"๐Ÿ“Š {self.name}" + output += f"๐Ÿงช Experiment: {self.name}" + output += f"\n๐Ÿ“‹ Run name: {self.run_name}" if self.description: output += f" - {self.description}" diff --git a/tests/test_experiments.py b/tests/test_experiments.py index d6ec67369..168310970 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -179,8 +179,9 @@ def test_run_experiment_on_langfuse_dataset(): # Verify dataset run exists via API api = get_api() - runs = api.datasets.get_runs(dataset_name) - assert len(runs.data) >= 1 + dataset_run = api.datasets.get_run( + dataset_name=dataset_name, run_name=result.run_name + ) # Validate traces are correctly persisted with input/output/metadata expected_data = {"Germany": "Capital of Germany", "France": "Capital of France"} @@ -256,22 +257,15 @@ def test_run_experiment_on_langfuse_dataset(): dataset_item.input == matching_input ), f"Trace {trace_id} should correspond to dataset item with input '{matching_input}'" - # Verify dataset run contains the correct trace IDs - dataset_run = None - for run in runs.data: - if run.id == dataset_run_id: - dataset_run = run - break - assert dataset_run is not None, f"Dataset run {dataset_run_id} should exist" - assert dataset_run.name == experiment_name, "Dataset run should have correct name" + assert dataset_run.name == result.run_name, "Dataset run should have correct name" assert ( dataset_run.description == "Test on Langfuse dataset" ), "Dataset run should have correct description" # Get dataset run items to verify trace linkage dataset_run_items = api.dataset_run_items.list( - dataset_id=dataset.id, run_name=experiment_name + dataset_id=dataset.id, run_name=result.run_name ) assert len(dataset_run_items.data) == 2, "Dataset run should have 2 items" @@ -570,6 +564,7 @@ def test_run_evaluator(**kwargs): result = dataset.run_experiment( name="Score persistence test", + run_name="Score persistence test", description="Test score persistence", task=mock_task, evaluators=[test_evaluator], @@ -585,12 +580,11 @@ def test_run_evaluator(**kwargs): # Verify scores are persisted via API api = get_api() - runs = api.datasets.get_runs(dataset_name) - assert len(runs.data) >= 1 + dataset_run = api.datasets.get_run( + dataset_name=dataset_name, run_name=result.run_name + ) - # Verify the run exists with correct name - run_names = [run.name for run in runs.data] - assert "Score persistence test" in run_names + assert dataset_run.name == 
"Score persistence test" def test_multiple_experiments_on_same_dataset(): @@ -616,6 +610,7 @@ def test_multiple_experiments_on_same_dataset(): # Run first experiment result1 = dataset.run_experiment( name="Experiment 1", + run_name="Experiment 1", description="First experiment", task=mock_task, evaluators=[factuality_evaluator], @@ -627,6 +622,7 @@ def test_multiple_experiments_on_same_dataset(): # Run second experiment result2 = dataset.run_experiment( name="Experiment 2", + run_name="Experiment 2", description="Second experiment", task=mock_task, evaluators=[simple_evaluator], From 469166b4dd9d8a7b7a7ab0ed93c5524defb589e4 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Tue, 16 Sep 2025 23:17:55 +0200 Subject: [PATCH 23/25] push --- tests/test_openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_openai.py b/tests/test_openai.py index 623802e55..056e4597d 100644 --- a/tests/test_openai.py +++ b/tests/test_openai.py @@ -94,7 +94,7 @@ def test_openai_chat_completion_stream(openai): assert len(chat_content) > 0 langfuse.flush() - sleep(1) + sleep(3) generation = get_api().observations.get_many( name=generation_name, type="GENERATION" From 1c9f01208650517d221a348a90826d7e67d17c0f Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Wed, 17 Sep 2025 09:52:13 +0200 Subject: [PATCH 24/25] add docstring --- langfuse/_client/client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index 86085ebbc..c5941af89 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -2514,7 +2514,8 @@ def run_experiment( If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. Returns: - ExperimentResult dictionary containing: + ExperimentResult containing: + - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset. - item_results: List of results for each processed item with outputs and evaluations - run_evaluations: List of aggregate evaluation results for the entire run - dataset_run_id: ID of the dataset run (if using Langfuse datasets) From 9e7cac693f8d900c8b0c68bda46a1bac9fce6ac7 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:07:28 +0200 Subject: [PATCH 25/25] add observationid to link calls --- langfuse/_client/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py index c5941af89..ceb29c5d3 100644 --- a/langfuse/_client/client.py +++ b/langfuse/_client/client.py @@ -101,6 +101,7 @@ ChatPromptClient, CreateDatasetItemRequest, CreateDatasetRequest, + CreateDatasetRunItemRequest, Dataset, DatasetItem, DatasetStatus, @@ -2806,8 +2807,6 @@ async def _process_experiment_item( # Link to dataset run if this is a dataset item if hasattr(item, "id") and hasattr(item, "dataset_id"): try: - from langfuse.model import CreateDatasetRunItemRequest - dataset_run_item = self.api.dataset_run_items.create( request=CreateDatasetRunItemRequest( runName=experiment_run_name, @@ -2815,6 +2814,7 @@ async def _process_experiment_item( metadata=experiment_metadata, datasetItemId=item.id, # type: ignore traceId=trace_id, + observationId=span.id, ) )