@@ -314,6 +314,85 @@ async def test_evaluate_success(
   assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2
 
 
+@pytest.mark.asyncio
+async def test_evaluate_skips_failed_inference_results(
+    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
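+  # Build one failed, one successful, and one unknown-status inference result.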
+  inference_results = [
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_failure",
+          inferences=None,
+          session_id="session_fail",
+          status=InferenceStatus.FAILURE,
+          error_message="simulated failure",
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_success",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_success",
+          status=InferenceStatus.SUCCESS,
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_unknown",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_unknown",
+          status=InferenceStatus.UNKNOWN,
+      ),
+  ]
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_request = EvaluateRequest(
+      inference_results=inference_results,
+      evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2),
+  )
+
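+  # Every eval case lookup resolves to the same single-invocation mock case.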
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  results = []
+  async for result in eval_service.evaluate(evaluate_request):
+    results.append(result)
+
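+  # The failed inference is surfaced as NOT_EVALUATED; the other two are scored.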
+  assert len(results) == 3
+  results_by_case = {result.eval_id: result for result in results}
+
+  failure_result = results_by_case['case_failure']
+  assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert failure_result.overall_eval_metric_results == []
+  assert failure_result.eval_metric_result_per_invocation == []
+
+  for case_id in ['case_success', 'case_unknown']:
+    case_result = results_by_case[case_id]
+    assert case_result.final_eval_status == EvalStatus.PASSED
+    assert len(case_result.overall_eval_metric_results) == 1
+    assert (
+        case_result.overall_eval_metric_results[0].metric_name == 'fake_metric'
+    )
+    assert case_result.overall_eval_metric_results[0].score == 0.9
+
+  assert mock_eval_sets_manager.get_eval_case.call_count == 3
+  assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3
+
+
 @pytest.mark.asyncio
 async def test_evaluate_eval_case_not_found(
     eval_service,