
Commit e3220de

added test_evaluate_skips_failed_inference_results test for mixed inferences
1 parent 4e81a16 commit e3220de

1 file changed: +76 −0 lines changed

tests/unittests/evaluation/test_local_eval_service.py

Lines changed: 76 additions & 0 deletions
@@ -314,6 +314,82 @@ async def test_evaluate_success(
   assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2
 
 
+@pytest.mark.asyncio
+async def test_evaluate_skips_failed_inference_results(
+    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_results = [
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_failure",
+          inferences=None,
+          session_id="session_fail",
+          status=InferenceStatus.FAILURE,
+          error_message="simulated failure",
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_success",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_success",
+          status=InferenceStatus.SUCCESS,
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_unknown",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_unknown",
+          status=InferenceStatus.UNKNOWN,
+      ),
+  ]
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_request = EvaluateRequest(
+      inference_results=inference_results,
+      evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2),
+  )
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  results = []
+  async for result in eval_service.evaluate(evaluate_request):
+    results.append(result)
+
+  assert len(results) == 3
+  results_by_case = {result.eval_id: result for result in results}
+
+  failure_result = results_by_case['case_failure']
+  assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert failure_result.overall_eval_metric_results == []
+  assert failure_result.eval_metric_result_per_invocation == []
+
+  for case_id in ['case_success', 'case_unknown']:
+    case_result = results_by_case[case_id]
+    assert case_result.final_eval_status == EvalStatus.PASSED
+    assert len(case_result.overall_eval_metric_results) == 1
+    assert (
+        case_result.overall_eval_metric_results[0].metric_name == 'fake_metric'
+    )
+    assert case_result.overall_eval_metric_results[0].score == 0.9
+
+  assert mock_eval_sets_manager.get_eval_case.call_count == 3
+  assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3
+
+
 @pytest.mark.asyncio
 async def test_evaluate_eval_case_not_found(
     eval_service,
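
For context, the new test pins down how eval_service.evaluate handles a mixed batch: an InferenceResult with InferenceStatus.FAILURE is never scored and comes back with EvalStatus.NOT_EVALUATED and empty metric-result lists, while SUCCESS and UNKNOWN results still go through metric evaluation. The snippet below is only a minimal sketch of that skip path inferred from the assertions above, not the actual LocalEvalService code; the EvalCaseResult construction details and the _score_inference_result helper are assumptions, and imports from the ADK evaluation package are omitted.

# Hypothetical sketch of the skip behavior exercised by
# test_evaluate_skips_failed_inference_results; inferred from the test's
# assertions, not the real LocalEvalService implementation.
async def evaluate(self, evaluate_request):
  for inference_result in evaluate_request.inference_results:
    if inference_result.status == InferenceStatus.FAILURE:
      # Failed inferences are not scored: final status is NOT_EVALUATED and
      # both metric-result collections stay empty, as the test asserts.
      # (Per the call-count assertions, the real service still looks up the
      # eval case and saves a result for the failed case; the sketch only
      # shows the scoring decision.)
      yield EvalCaseResult(  # field names mirror what the test reads back
          eval_set_id=inference_result.eval_set_id,
          eval_id=inference_result.eval_case_id,
          final_eval_status=EvalStatus.NOT_EVALUATED,
          overall_eval_metric_results=[],
          eval_metric_result_per_invocation=[],
          session_id=inference_result.session_id,
      )
      continue
    # SUCCESS and UNKNOWN results are evaluated normally;
    # _score_inference_result is a stand-in for the per-case metric evaluation.
    yield await self._score_inference_result(inference_result, evaluate_request)

Locally, the new test can be run on its own with pytest's -k filter against tests/unittests/evaluation/test_local_eval_service.py.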
