
Commit a470c6a

Handle None inferences in eval results for issue #2729
1 parent f273517 commit a470c6a
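
The failure mode behind the fix, as far as the diff shows: a failed inference run can leave `inference_result.inferences` as `None`, and the pre-existing check `len(inference_result.inferences) != len(eval_case.conversation)` then dies with `TypeError: object of type 'NoneType' has no len()`. A tiny standalone sketch of that pitfall and the guard pattern the commit applies; the functions below are illustrative only, not ADK APIs:

from typing import List, Optional

def unguarded_check(inferences: Optional[List[str]], conversation: List[str]) -> bool:
  # Shape of the old check: it assumes `inferences` is a list, so a failed run
  # that left it as None raises "TypeError: object of type 'NoneType' has no len()".
  return len(inferences) != len(conversation)

def guarded_check(inferences: Optional[List[str]], conversation: List[str]) -> Optional[bool]:
  # The commit's approach: bail out before calling len() when there is nothing
  # to compare, so the caller can mark the case as not evaluated instead of crashing.
  if inferences is None:
    return None
  return len(inferences) != len(conversation)

print(guarded_check(None, ['turn 1']))       # None -> mark case NOT_EVALUATED
print(guarded_check(['reply'], ['turn 1']))  # False -> turn counts line up, score it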


2 files changed: +159 -4 lines changed


src/google/adk/evaluation/local_eval_service.py

Lines changed: 115 additions & 4 deletions
@@ -168,10 +168,43 @@ async def run_evaluation(inference_result):
           evaluate_config=evaluate_request.evaluate_config,
       )
 
-    evaluation_tasks = [
-        run_evaluation(inference_result)
-        for inference_result in evaluate_request.inference_results
-    ]
+    evaluation_tasks = []
+
+    for inference_result in evaluate_request.inference_results:
+      if inference_result.status == InferenceStatus.FAILURE:
+        logger.warning(
+            'Skipping evaluation for eval case `%s` because inference failed'
+            ' with status `%s`: %s',
+            inference_result.eval_case_id,
+            inference_result.status,
+            inference_result.error_message,
+        )
+        eval_case = self._eval_sets_manager.get_eval_case(
+            app_name=inference_result.app_name,
+            eval_set_id=inference_result.eval_set_id,
+            eval_case_id=inference_result.eval_case_id,
+        )
+        if eval_case is None:
+          raise NotFoundError(
+              f'Eval case with id {inference_result.eval_case_id} not found'
+              f' for app {inference_result.app_name} and eval set'
+              f' {inference_result.eval_set_id}.'
+          )
+        eval_case_result = await self._build_not_evaluated_eval_case_result(
+            inference_result=inference_result,
+            eval_case=eval_case,
+            reason='Inference failed',
+        )
+        if self._eval_set_results_manager:
+          self._eval_set_results_manager.save_eval_set_result(
+              app_name=inference_result.app_name,
+              eval_set_id=inference_result.eval_set_id,
+              eval_case_results=[eval_case_result],
+          )
+        yield eval_case_result
+        continue
+
+      evaluation_tasks.append(run_evaluation(inference_result))
 
     for evaluation_task in asyncio.as_completed(evaluation_tasks):
       inference_result, eval_case_result = await evaluation_task
@@ -226,6 +259,29 @@ async def _evaluate_single_inference_result(
         else 'test_user_id'
     )
 
+    if inference_result.status == InferenceStatus.FAILURE:
+      eval_case_result = await self._build_not_evaluated_eval_case_result(
+          inference_result=inference_result,
+          eval_case=eval_case,
+          user_id=user_id,
+          reason='Inference status indicates failure',
+      )
+      return (inference_result, eval_case_result)
+
+    if inference_result.inferences is None:
+      logger.warning(
+          'Inference result for eval case `%s` did not include inferences;'
+          ' marking as not evaluated.',
+          inference_result.eval_case_id,
+      )
+      eval_case_result = await self._build_not_evaluated_eval_case_result(
+          inference_result=inference_result,
+          eval_case=eval_case,
+          user_id=user_id,
+          reason='Missing inference payload',
+      )
+      return (inference_result, eval_case_result)
+
     if eval_case.conversation_scenario is None and len(
         inference_result.inferences
     ) != len(eval_case.conversation):
@@ -389,6 +445,61 @@ def _generate_final_eval_status(
 
     return final_eval_status
 
+  async def _build_not_evaluated_eval_case_result(
+      self,
+      *,
+      inference_result: InferenceResult,
+      eval_case: EvalCase,
+      user_id: Optional[str] = None,
+      reason: Optional[str] = None,
+  ) -> EvalCaseResult:
+    """Constructs an EvalCaseResult for cases that could not be evaluated."""
+    resolved_user_id = user_id or self._resolve_user_id(eval_case)
+    session_details = await self._safe_get_session_details(
+        app_name=inference_result.app_name,
+        user_id=resolved_user_id,
+        session_id=inference_result.session_id,
+    )
+    if reason:
+      logger.info(
+          'Eval case `%s` marked as not evaluated: %s',
+          inference_result.eval_case_id,
+          reason,
+      )
+    return EvalCaseResult(
+        eval_set_file=inference_result.eval_set_id,
+        eval_set_id=inference_result.eval_set_id,
+        eval_id=inference_result.eval_case_id,
+        final_eval_status=EvalStatus.NOT_EVALUATED,
+        overall_eval_metric_results=[],
+        eval_metric_result_per_invocation=[],
+        session_id=inference_result.session_id,
+        session_details=session_details,
+        user_id=resolved_user_id,
+    )
+
+  def _resolve_user_id(self, eval_case: EvalCase) -> str:
+    if eval_case.session_input and eval_case.session_input.user_id:
+      return eval_case.session_input.user_id
+    return 'test_user_id'
+
+  async def _safe_get_session_details(
+      self, *, app_name: str, user_id: str, session_id: str
+  ):
+    try:
+      return await self._session_service.get_session(
+          app_name=app_name, user_id=user_id, session_id=session_id
+      )
+    except NotFoundError:
+      logger.warning(
+          'Session `%s` for app `%s` and user `%s` not found while building'
+          ' eval result; continuing without session details.',
+          session_id,
+          app_name,
+          user_id,
+      )
+      return None
+
   async def _perform_inference_single_eval_item(
       self,
       app_name: str,
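
Net effect of the service changes above: an inference whose status is FAILURE, or whose `inferences` payload is missing, now short-circuits into a NOT_EVALUATED result (and, in the streaming path, is saved and yielded immediately) instead of reaching the scoring code that assumes a list of inferences. A minimal, self-contained sketch of that control flow using simplified stand-in types, not the real ADK InferenceResult/EvalCaseResult models:

from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional

class Status(Enum):
  SUCCESS = 'success'
  FAILURE = 'failure'
  NOT_EVALUATED = 'not_evaluated'

@dataclass
class FakeInferenceResult:
  eval_case_id: str
  status: Status
  inferences: Optional[List[str]] = None
  error_message: Optional[str] = None

@dataclass
class FakeEvalCaseResult:
  eval_id: str
  final_eval_status: Status
  metric_results: List[float] = field(default_factory=list)

def evaluate(result: FakeInferenceResult, expected_turns: int) -> FakeEvalCaseResult:
  """Simplified version of the guard-clause flow added to the eval service."""
  # Guard 1: the inference run itself failed; there is nothing to score.
  if result.status is Status.FAILURE:
    return FakeEvalCaseResult(result.eval_case_id, Status.NOT_EVALUATED)
  # Guard 2: no inferences were recorded for this case.
  if result.inferences is None:
    return FakeEvalCaseResult(result.eval_case_id, Status.NOT_EVALUATED)
  # Only now is the per-turn comparison safe.
  if len(result.inferences) != expected_turns:
    raise ValueError('inference count does not match conversation length')
  return FakeEvalCaseResult(
      result.eval_case_id, Status.SUCCESS, [1.0] * expected_turns
  )

failed = FakeInferenceResult('case1', Status.FAILURE, error_message='boom')
print(evaluate(failed, expected_turns=1).final_eval_status)  # Status.NOT_EVALUATED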

tests/unittests/evaluation/test_local_eval_service.py

Lines changed: 44 additions & 0 deletions
@@ -407,6 +407,50 @@ async def test_evaluate_single_inference_result(
   assert metric_result.eval_status == EvalStatus.PASSED
 
 
+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_handles_failed_inference(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=None,
+      session_id="session1",
+      status=InferenceStatus.FAILURE,
+      error_message="simulated inference failure",
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert result.overall_eval_metric_results == []
+  assert result.eval_metric_result_per_invocation == []
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+
 @pytest.mark.asyncio
 async def test_evaluate_single_inference_result_for_conversation_scenario(
     eval_service, mock_eval_sets_manager, mocker