5 changes: 4 additions & 1 deletion langfuse/_client/client.py
@@ -2730,6 +2730,7 @@ async def process_item(item: ExperimentItem) -> ExperimentItemResult:
             comment=evaluation.comment,
             metadata=evaluation.metadata,
             data_type=evaluation.data_type,  # type: ignore
+            config_id=evaluation.config_id,
         )

     except Exception as e:
@@ -2856,9 +2857,11 @@ async def _process_experiment_item(
         self.create_score(
             trace_id=trace_id,
             name=evaluation.name,
-            value=evaluation.value or -1,
+            value=evaluation.value,  # type: ignore
             comment=evaluation.comment,
             metadata=evaluation.metadata,
+            config_id=evaluation.config_id,
+            data_type=evaluation.data_type,  # type: ignore
         )

     except Exception as e:
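The key fix in the second hunk is dropping the `value or -1` coercion: BOOLEAN scores legitimately carry `False`, and numeric scores legitimately carry `0`, both of which are falsy in Python, so the old expression silently rewrote them to `-1`. A minimal sketch of that failure mode (standalone illustration, not SDK code):

```python
# Why `value or -1` is unsafe: every falsy-but-valid score value is replaced.
for value in (False, 0, 0.0, True, 1):
    print(repr(value), "->", repr(value or -1))
# False -> -1
# 0 -> -1
# 0.0 -> -1
# True -> True
# 1 -> 1
```

Both hunks also forward `config_id` and `data_type` from the returned `Evaluation` through to `create_score`, so evaluator-supplied score configs and data types are no longer dropped.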
129 changes: 129 additions & 0 deletions tests/test_experiments.py
@@ -668,3 +668,132 @@ def test_format_experiment_results_basic():

     langfuse_client.flush()
     time.sleep(1)
+
+
+def test_boolean_score_types():
+    """Test that BOOLEAN score types are properly ingested and persisted."""
+    from langfuse.api import ScoreDataType
+
+    langfuse_client = get_client()
+
+    def boolean_evaluator(*, input, output, expected_output=None, **kwargs):
+        """Boolean evaluator that checks if output contains the expected answer."""
+        if not expected_output:
+            return Evaluation(
+                name="has_expected_content",
+                value=False,
+                data_type=ScoreDataType.BOOLEAN,
+                comment="No expected output to check",
+            )
+
+        contains_expected = expected_output.lower() in str(output).lower()
+        return Evaluation(
+            name="has_expected_content",
+            value=contains_expected,
+            data_type=ScoreDataType.BOOLEAN,
+            comment=f"Output {'contains' if contains_expected else 'does not contain'} expected content",
+        )
+
+    def boolean_run_evaluator(*, item_results: List[ExperimentItemResult], **kwargs):
+        """Run evaluator that returns boolean based on all items passing."""
+        if not item_results:
+            return Evaluation(
+                name="all_items_pass",
+                value=False,
+                data_type=ScoreDataType.BOOLEAN,
+                comment="No items to evaluate",
+            )
+
+        # Check if all boolean evaluations are True
+        all_pass = True
+        for item_result in item_results:
+            for evaluation in item_result.evaluations:
+                if (
+                    evaluation.name == "has_expected_content"
+                    and evaluation.value is False
+                ):
+                    all_pass = False
+                    break
+            if not all_pass:
+                break
+
+        return Evaluation(
+            name="all_items_pass",
+            value=all_pass,
+            data_type=ScoreDataType.BOOLEAN,
+            comment=f"{'All' if all_pass else 'Not all'} items passed the boolean evaluation",
+        )
+
+    # Test data where some items should pass and some should fail
+    test_data = [
+        {"input": "What is the capital of Germany?", "expected_output": "Berlin"},
+        {"input": "What is the capital of France?", "expected_output": "Paris"},
+        {"input": "What is the capital of Spain?", "expected_output": "Madrid"},
+    ]
+
+    # Task that returns correct answers for Germany and France, but wrong for Spain
+    def mock_task_with_boolean_results(*, item: ExperimentItem, **kwargs):
+        input_val = (
+            item.get("input")
+            if isinstance(item, dict)
+            else getattr(item, "input", "unknown")
+        )
+        input_str = str(input_val) if input_val is not None else ""
+
+        if "Germany" in input_str:
+            return "The capital is Berlin"
+        elif "France" in input_str:
+            return "The capital is Paris"
+        else:
+            return "I don't know the capital"
+
+    result = langfuse_client.run_experiment(
+        name="Boolean score type test",
+        description="Test BOOLEAN data type in scores",
+        data=test_data,
+        task=mock_task_with_boolean_results,
+        evaluators=[boolean_evaluator],
+        run_evaluators=[boolean_run_evaluator],
+    )
+
+    # Validate basic result structure
+    assert len(result.item_results) == 3
+    assert len(result.run_evaluations) == 1
+
+    # Validate individual item evaluations have boolean values
+    expected_results = [
+        True,
+        True,
+        False,
+    ]  # Germany and France should pass, Spain should fail
+    for i, item_result in enumerate(result.item_results):
+        assert len(item_result.evaluations) == 1
+        eval_result = item_result.evaluations[0]
+        assert eval_result.name == "has_expected_content"
+        assert isinstance(eval_result.value, bool)
+        assert eval_result.value == expected_results[i]
+        assert eval_result.data_type == ScoreDataType.BOOLEAN
+
+    # Validate run evaluation is boolean and should be False (not all items passed)
+    run_eval = result.run_evaluations[0]
+    assert run_eval.name == "all_items_pass"
+    assert isinstance(run_eval.value, bool)
+    assert run_eval.value is False  # Spain should fail, so not all pass
+    assert run_eval.data_type == ScoreDataType.BOOLEAN
+
+    # Flush and wait for server processing
+    langfuse_client.flush()
+    time.sleep(3)
+
+    # Verify scores are persisted via API with correct data types
+    api = get_api()
+    for i, item_result in enumerate(result.item_results):
+        trace_id = item_result.trace_id
+        assert trace_id is not None, f"Item {i} should have a trace_id"
+
+        # Fetch trace from API to verify score persistence
+        trace = api.trace.get(trace_id)
+        assert trace is not None, f"Trace {trace_id} should exist"
+
+        for score in trace.scores:
+            assert score.data_type == "BOOLEAN"
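
For reference, a minimal evaluator sketch that exercises both newly forwarded fields. This is an illustration only: the `Evaluation` import assumes the same symbol the surrounding tests use, and `"my-boolean-config"` is a placeholder score config ID, not something defined in this PR:

```python
from langfuse.api import ScoreDataType

# Assumed import: `Evaluation` as used elsewhere in tests/test_experiments.py.
from langfuse import Evaluation


def exact_match_evaluator(*, input, output, expected_output=None, **kwargs):
    """Boolean evaluator whose config_id/data_type now reach create_score."""
    matched = str(output).strip() == str(expected_output or "").strip()
    return Evaluation(
        name="exact_match",
        value=matched,  # False survives: no more `or -1` coercion
        data_type=ScoreDataType.BOOLEAN,
        config_id="my-boolean-config",  # placeholder score config ID (assumption)
        comment="exact string match against expected_output",
    )
```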