@@ -168,10 +168,43 @@ async def run_evaluation(inference_result):
168168 evaluate_config = evaluate_request .evaluate_config ,
169169 )
170170
171- evaluation_tasks = [
172- run_evaluation (inference_result )
173- for inference_result in evaluate_request .inference_results
174- ]
171+ evaluation_tasks = []
172+
173+ for inference_result in evaluate_request .inference_results :
174+ if inference_result .status == InferenceStatus .FAILURE :
175+ logger .warning (
176+ 'Skipping evaluation for eval case `%s` because inference failed'
177+ ' with status `%s`: %s' ,
178+ inference_result .eval_case_id ,
179+ inference_result .status ,
180+ inference_result .error_message ,
181+ )
182+ eval_case = self ._eval_sets_manager .get_eval_case (
183+ app_name = inference_result .app_name ,
184+ eval_set_id = inference_result .eval_set_id ,
185+ eval_case_id = inference_result .eval_case_id ,
186+ )
187+ if eval_case is None :
188+ raise NotFoundError (
189+ f'Eval case with id { inference_result .eval_case_id } not found'
190+ f' for app { inference_result .app_name } and eval set'
191+ f' { inference_result .eval_set_id } .'
192+ )
193+ eval_case_result = await self ._build_not_evaluated_eval_case_result (
194+ inference_result = inference_result ,
195+ eval_case = eval_case ,
196+ reason = 'Inference failed' ,
197+ )
198+ if self ._eval_set_results_manager :
199+ self ._eval_set_results_manager .save_eval_set_result (
200+ app_name = inference_result .app_name ,
201+ eval_set_id = inference_result .eval_set_id ,
202+ eval_case_results = [eval_case_result ],
203+ )
204+ yield eval_case_result
205+ continue
206+
207+ evaluation_tasks .append (run_evaluation (inference_result ))
175208
176209 for evaluation_task in asyncio .as_completed (evaluation_tasks ):
177210 inference_result , eval_case_result = await evaluation_task
@@ -226,6 +259,29 @@ async def _evaluate_single_inference_result(
226259 else 'test_user_id'
227260 )
228261
262+ if inference_result .status == InferenceStatus .FAILURE :
263+ eval_case_result = await self ._build_not_evaluated_eval_case_result (
264+ inference_result = inference_result ,
265+ eval_case = eval_case ,
266+ user_id = user_id ,
267+ reason = 'Inference status indicates failure' ,
268+ )
269+ return (inference_result , eval_case_result )
270+
271+ if inference_result .inferences is None :
272+ logger .warning (
273+ 'Inference result for eval case `%s` did not include inferences;'
274+ ' marking as not evaluated.' ,
275+ inference_result .eval_case_id ,
276+ )
277+ eval_case_result = await self ._build_not_evaluated_eval_case_result (
278+ inference_result = inference_result ,
279+ eval_case = eval_case ,
280+ user_id = user_id ,
281+ reason = 'Missing inference payload' ,
282+ )
283+ return (inference_result , eval_case_result )
284+
229285 if eval_case .conversation_scenario is None and len (
230286 inference_result .inferences
231287 ) != len (eval_case .conversation ):
@@ -389,6 +445,61 @@ def _generate_final_eval_status(
389445
390446 return final_eval_status
391447
async def _build_not_evaluated_eval_case_result(
    self,
    *,
    inference_result: InferenceResult,
    eval_case: EvalCase,
    user_id: Optional[str] = None,
    reason: Optional[str] = None,
) -> EvalCaseResult:
    """Create the placeholder result for an eval case that was never scored.

    Args:
      inference_result: Supplies the app name, eval set id, eval case id and
        session id that are copied onto the result.
      eval_case: Used to derive a user id when `user_id` is not supplied.
      user_id: Explicit user id; a falsy value falls back to
        `self._resolve_user_id(eval_case)`.
      reason: Optional explanation; when truthy it is logged at INFO level.

    Returns:
      An `EvalCaseResult` whose status is `EvalStatus.NOT_EVALUATED` and whose
      metric result lists are empty.
    """
    effective_user_id = (
        user_id if user_id else self._resolve_user_id(eval_case)
    )

    # Best-effort lookup: the helper yields None when the session is missing.
    session_snapshot = await self._safe_get_session_details(
        app_name=inference_result.app_name,
        user_id=effective_user_id,
        session_id=inference_result.session_id,
    )

    if reason:
        logger.info(
            'Eval case `%s` marked as not evaluated: %s',
            inference_result.eval_case_id,
            reason,
        )

    result_fields = {
        # The eval set id is used for both the file and the id field.
        'eval_set_file': inference_result.eval_set_id,
        'eval_set_id': inference_result.eval_set_id,
        'eval_id': inference_result.eval_case_id,
        'final_eval_status': EvalStatus.NOT_EVALUATED,
        # No metric ran, so both result collections stay empty.
        'overall_eval_metric_results': [],
        'eval_metric_result_per_invocation': [],
        'session_id': inference_result.session_id,
        'session_details': session_snapshot,
        'user_id': effective_user_id,
    }
    return EvalCaseResult(**result_fields)
480+
481+ def _resolve_user_id (self , eval_case : EvalCase ) -> str :
482+ if eval_case .session_input and eval_case .session_input .user_id :
483+ return eval_case .session_input .user_id
484+ return 'test_user_id'
485+
486+ async def _safe_get_session_details (
487+ self , * , app_name : str , user_id : str , session_id : str
488+ ):
489+ try :
490+ return await self ._session_service .get_session (
491+ app_name = app_name , user_id = user_id , session_id = session_id
492+ )
493+ except NotFoundError :
494+ logger .warning (
495+ 'Session `%s` for app `%s` and user `%s` not found while building'
496+ ' eval result; continuing without session details.' ,
497+ session_id ,
498+ app_name ,
499+ user_id ,
500+ )
501+ return None
502+
392503 async def _perform_inference_single_eval_item (
393504 self ,
394505 app_name : str ,
0 commit comments