From ede184e93ea5ec1ae9278bdc605fb3be1dd3ba5a Mon Sep 17 00:00:00 2001 From: Ishan Raj Singh Date: Fri, 5 Dec 2025 20:41:45 +0530 Subject: [PATCH 01/13] Fix: Use App (with plugins) for eval when available - Extend LocalEvalService to accept optional App parameter - Route evaluation through App so plugins are applied - Add _generate_inferences_from_app() to EvaluationGenerator - Update CLI eval command to load and pass App - Add helper to load App from agent module Fixes #3833 --- src/google/adk/cli/cli_tools_click.py | 43 ++++++++++- .../adk/evaluation/evaluation_generator.py | 73 ++++++++++++++++++- .../adk/evaluation/local_eval_service.py | 43 ++++++++--- 3 files changed, 145 insertions(+), 14 deletions(-) diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py index 019b328483..1cf9d108b9 100644 --- a/src/google/adk/cli/cli_tools_click.py +++ b/src/google/adk/cli/cli_tools_click.py @@ -22,7 +22,11 @@ import logging import os import tempfile -from typing import Optional +from typing import Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from ..apps.app import App + import click from click.core import ParameterSource @@ -279,6 +283,34 @@ def cli_run( ) ) +def _load_app_from_module(module_path: str) -> Optional['App']: + """Try to load an App instance from the agent module. 
+ + Args: + module_path: Python module path (e.g., 'my_package.my_agent') + + Returns: + App instance if found, None otherwise + """ + try: + import importlib + module = importlib.import_module(module_path) + + # Check for 'app' attribute (most common convention) + if hasattr(module, 'app'): + from ..apps.app import App + candidate = getattr(module, 'app') + if isinstance(candidate, App): + logger.info(f"Loaded App instance from {module_path}") + return candidate + + logger.debug(f"No App instance found in {module_path}") + + except (ImportError, AttributeError) as e: + logger.debug(f"Could not load App from module {module_path}: {e}") + + return None + @main.command("eval", cls=HelpfulCommand) @click.argument( @@ -471,10 +503,19 @@ def cli_eval( ) try: + # Try to load App if available (for plugin support like ReflectAndRetryToolPlugin) + app = _load_app_from_module(agent_module_file_path) + + if app: + logger.info("Using App instance for evaluation (plugins will be applied)") + else: + logger.info("No App found, using root_agent directly") + eval_service = LocalEvalService( root_agent=root_agent, eval_sets_manager=eval_sets_manager, eval_set_results_manager=eval_set_results_manager, + app=app, # NEW: Pass app if available ) inference_results = asyncio.run( diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 7f1c94f133..8a25f9404f 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -15,8 +15,11 @@ from __future__ import annotations import importlib -from typing import Any -from typing import Optional +from typing import Any, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from ..apps.app import App + import uuid from pydantic import BaseModel @@ -220,6 +223,71 @@ async def _generate_inferences_from_root_agent( ) return response_invocations + + @staticmethod + async def _generate_inferences_from_app( + invocations: 
list['Invocation'], + app: 'App', + initial_session: Optional['SessionInput'], + session_id: str, + session_service: 'BaseSessionService', + artifact_service: 'BaseArtifactService', + ) -> list['Invocation']: + """Generate inferences by invoking through App (preserving plugins).""" + + actual_invocations = [] + + # Determine user_id consistently + user_id = 'test_user_id' + if initial_session and initial_session.user_id is not None: + user_id = initial_session.user_id + + # Initialize session if provided + if initial_session: + app_name = initial_session.app_name if initial_session.app_name else app.name + await session_service.create_session( + app_name=app_name, + user_id=user_id, + session_id=session_id, + state=initial_session.state if initial_session.state else {}, + ) + + # Run each invocation through the app + for expected_invocation in invocations: + user_content = expected_invocation.user_content + + # Invoke through App (this applies all plugins) + response = await app.run( + user_id=user_id, + session_id=session_id, + new_message=user_content, + ) + + # Extract response similar to existing implementation + final_response = None + tool_uses = [] + invocation_id = "" + + async for event in response: + invocation_id = invocation_id or event.invocation_id + + if event.is_final_response() and event.content and event.content.parts: + final_response = event.content + elif event.get_function_calls(): + for call in event.get_function_calls(): + tool_uses.append(call) + + actual_invocations.append( + Invocation( + invocation_id=invocation_id, + user_content=user_content, + final_response=final_response, + intermediate_data=IntermediateData(tool_uses=tool_uses), + ) + ) + + return actual_invocations + @staticmethod def _process_query_with_session(session_data, data): @@ -259,3 +327,4 @@ def _process_query_with_session(session_data, data): responses[index]["actual_tool_use"] = actual_tool_uses responses[index]["response"] = response return responses + diff --git 
a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index fa50f70d23..af4ca1c5b7 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -20,6 +20,10 @@ from typing import AsyncGenerator from typing import Callable from typing import Optional +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ..apps.app import App import uuid from typing_extensions import override @@ -38,6 +42,7 @@ from .base_eval_service import InferenceResult from .base_eval_service import InferenceStatus from .eval_case import Invocation +from .eval_case import SessionInput from .eval_metrics import EvalMetric from .eval_metrics import EvalMetricResult from .eval_metrics import EvalMetricResultPerInvocation @@ -73,9 +78,11 @@ def __init__( artifact_service: Optional[BaseArtifactService] = None, eval_set_results_manager: Optional[EvalSetResultsManager] = None, session_id_supplier: Callable[[], str] = _get_session_id, + app: Optional['App'] = None, ): self._root_agent = root_agent self._eval_sets_manager = eval_sets_manager + self._app = app metric_evaluator_registry = ( metric_evaluator_registry or DEFAULT_METRIC_EVALUATOR_REGISTRY ) @@ -364,23 +371,37 @@ async def _perform_inference_sigle_eval_item( ) try: - inferences = ( - await EvaluationGenerator._generate_inferences_from_root_agent( - invocations=eval_case.conversation, - root_agent=root_agent, - initial_session=initial_session, - session_id=session_id, - session_service=self._session_service, - artifact_service=self._artifact_service, + # Use App if available (so plugins like ReflectAndRetryToolPlugin run) + if self._app is not None: + inferences = ( + await EvaluationGenerator._generate_inferences_from_app( + invocations=eval_case.conversation, + app=self._app, + initial_session=initial_session, + session_id=session_id, + session_service=self._session_service, + artifact_service=self._artifact_service, + ) + ) + else: + 
# Fallback to direct root_agent usage (existing behavior) + inferences = ( + await EvaluationGenerator._generate_inferences_from_root_agent( + invocations=eval_case.conversation, + root_agent=root_agent, + initial_session=initial_session, + session_id=session_id, + session_service=self._session_service, + artifact_service=self._artifact_service, + ) ) - ) inference_result.inferences = inferences inference_result.status = InferenceStatus.SUCCESS return inference_result except Exception as e: - # We intentionally catch the Exception as we don't failures to affect + # We intentionally catch the Exception as we don't want failures to affect # other inferences. logger.error( 'Inference failed for eval case `%s` with error %s', @@ -389,4 +410,4 @@ async def _perform_inference_sigle_eval_item( ) inference_result.status = InferenceStatus.FAILURE inference_result.error_message = str(e) - return inference_result + return inference_result \ No newline at end of file From cf77b850fd5f020e2eb0c86c7a29712777cda7c0 Mon Sep 17 00:00:00 2001 From: Ishan Raj Singh Date: Fri, 5 Dec 2025 23:00:13 +0530 Subject: [PATCH 02/13] Fix critical issues from code review - Add AsyncGenerator to imports - Fix _generate_inferences_from_app to use Runner correctly - Add memory_service parameter - Capture both tool_uses and tool_responses - Fix return statement variable name --- .../adk/evaluation/evaluation_generator.py | 32 +++++++++++++------ .../adk/evaluation/local_eval_service.py | 1 + 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 12da5c63c3..30d6d5ac46 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -16,7 +16,7 @@ import copy import importlib -from typing import Any, Optional, TYPE_CHECKING +from typing import Any, AsyncGenerator, Optional, TYPE_CHECKING if TYPE_CHECKING: from ..apps.app import 
App @@ -41,6 +41,7 @@ from .app_details import AgentDetails from .app_details import AppDetails from .eval_case import EvalCase +from .eval_case import IntermediateData from .eval_case import Invocation from .eval_case import InvocationEvent from .eval_case import InvocationEvents @@ -326,7 +327,7 @@ def convert_events_to_eval_invocations( ) ) - return response_invocations + return invocations @staticmethod async def _generate_inferences_from_app( @@ -336,8 +337,10 @@ async def _generate_inferences_from_app( session_id: str, session_service: 'BaseSessionService', artifact_service: 'BaseArtifactService', + memory_service: 'BaseMemoryService', ) -> list['Invocation']: """Generate inferences by invoking through App (preserving plugins).""" + from ..runners import Runner actual_invocations = [] @@ -356,12 +359,20 @@ async def _generate_inferences_from_app( state=initial_session.state if initial_session.state else {}, ) + # Create Runner with App to preserve plugins + runner = Runner( + app=app, + session_service=session_service, + artifact_service=artifact_service, + memory_service=memory_service, + ) + # Run each invocation through the app for expected_invocation in invocations: user_content = expected_invocation.user_content - # Invoke through App (this applies all plugins) - response = await app.run( + # Invoke through Runner (this applies all plugins) + response = runner.run_async( user_id=user_id, session_id=session_id, new_message=user_content, @@ -370,6 +381,7 @@ async def _generate_inferences_from_app( # Extract response similar to existing implementation final_response = None tool_uses = [] + tool_responses = [] invocation_id = "" async for event in response: @@ -377,22 +389,24 @@ async def _generate_inferences_from_app( if event.is_final_response() and event.content and event.content.parts: final_response = event.content - elif event.get_function_calls(): - for call in event.get_function_calls(): - tool_uses.append(call) + elif calls := 
event.get_function_calls(): + tool_uses.extend(calls) + elif responses := event.get_function_responses(): + tool_responses.extend(responses) actual_invocations.append( Invocation( invocation_id=invocation_id, user_content=user_content, final_response=final_response, - intermediate_data=IntermediateData(tool_uses=tool_uses), + intermediate_data=IntermediateData( + tool_uses=tool_uses, tool_responses=tool_responses + ), ) ) return actual_invocations - return invocations @staticmethod def _get_app_details_by_invocation_id( diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index 1bc54de3b2..43643a12f9 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -423,6 +423,7 @@ async def _perform_inference_single_eval_item( session_id=session_id, session_service=self._session_service, artifact_service=self._artifact_service, + memory_service=self._memory_service, ) ) else: From 231152831803ad695d1fdd58b4e9f1e9d2023c31 Mon Sep 17 00:00:00 2001 From: Ishan Raj Singh Date: Fri, 5 Dec 2025 23:43:00 +0530 Subject: [PATCH 03/13] feat: Support App plugins in evaluation framework - Add _generate_inferences_from_app to preserve App-level plugins during evals - Enables ReflectAndRetryToolPlugin and other plugins to run in evaluation mode - Add early exit for failed inferences to prevent crashes - Maintain backward compatibility with direct root_agent usage - Properly label eval requests with EVAL_CLIENT_LABEL for telemetry Tested: Plugins load and execute during evals, failed inferences handled gracefully --- .../adk/evaluation/evaluation_generator.py | 77 +++++++++---------- .../adk/evaluation/local_eval_service.py | 45 ++++++++--- 2 files changed, 68 insertions(+), 54 deletions(-) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 30d6d5ac46..1c7cd8c9bd 100644 --- 
a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -331,8 +331,9 @@ def convert_events_to_eval_invocations( @staticmethod async def _generate_inferences_from_app( - invocations: list['Invocation'], app: 'App', + root_agent: 'Agent', + user_simulator: 'UserSimulator', initial_session: Optional['SessionInput'], session_id: str, session_service: 'BaseSessionService', @@ -340,9 +341,6 @@ async def _generate_inferences_from_app( memory_service: 'BaseMemoryService', ) -> list['Invocation']: """Generate inferences by invoking through App (preserving plugins).""" - from ..runners import Runner - - actual_invocations = [] # Determine user_id consistently user_id = 'test_user_id' @@ -359,53 +357,48 @@ async def _generate_inferences_from_app( state=initial_session.state if initial_session.state else {}, ) + # Create plugins to track requests (needed for app_details) + request_intercepter_plugin = _RequestIntercepterPlugin( + name="request_intercepter_plugin" + ) + ensure_retry_options_plugin = EnsureRetryOptionsPlugin( + name="ensure_retry_options" + ) + # Create Runner with App to preserve plugins - runner = Runner( + async with Runner( app=app, session_service=session_service, artifact_service=artifact_service, memory_service=memory_service, - ) - - # Run each invocation through the app - for expected_invocation in invocations: - user_content = expected_invocation.user_content + plugins=[request_intercepter_plugin, ensure_retry_options_plugin], + ) as runner: + events = [] - # Invoke through Runner (this applies all plugins) - response = runner.run_async( - user_id=user_id, - session_id=session_id, - new_message=user_content, - ) - - # Extract response similar to existing implementation - final_response = None - tool_uses = [] - tool_responses = [] - invocation_id = "" - - async for event in response: - invocation_id = invocation_id or event.invocation_id - - if event.is_final_response() and event.content and 
event.content.parts: - final_response = event.content - elif calls := event.get_function_calls(): - tool_uses.extend(calls) - elif responses := event.get_function_responses(): - tool_responses.extend(responses) + # Loop through user simulator messages (handles both static and dynamic) + while True: + next_user_message = await user_simulator.get_next_user_message( + copy.deepcopy(events) + ) + if next_user_message.status == UserSimulatorStatus.SUCCESS: + async for event in EvaluationGenerator._generate_inferences_for_single_user_invocation( + runner, user_id, session_id, next_user_message.user_message + ): + events.append(event) + else: # no more messages + break - actual_invocations.append( - Invocation( - invocation_id=invocation_id, - user_content=user_content, - final_response=final_response, - intermediate_data=IntermediateData( - tool_uses=tool_uses, tool_responses=tool_responses - ), + # Extract app details from intercepted requests + app_details_by_invocation_id = ( + EvaluationGenerator._get_app_details_by_invocation_id( + events, request_intercepter_plugin ) ) - - return actual_invocations + + # Convert events to invocations + return EvaluationGenerator.convert_events_to_eval_invocations( + events, app_details_by_invocation_id + ) @staticmethod diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index 43643a12f9..1198ccf145 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -193,8 +193,8 @@ async def run_evaluation(inference_result): yield eval_case_result async def _evaluate_single_inference_result( - self, inference_result: InferenceResult, evaluate_config: EvaluateConfig - ) -> tuple[InferenceResult, EvalCaseResult]: + self, inference_result: InferenceResult, evaluate_config: EvaluateConfig +) -> tuple[InferenceResult, EvalCaseResult]: """Returns the inference result and its corresponding EvalCaseResult. 
A single inference result can have multiple invocations. For each @@ -203,6 +203,24 @@ async def _evaluate_single_inference_result( The EvalCaseResult contains scores for each metric per invocation and the overall score. """ + # Handle failed inferences early - skip evaluation + if ( + inference_result.status == InferenceStatus.FAILURE + or inference_result.inferences is None + ): + eval_case_result = EvalCaseResult( + eval_set_file=inference_result.eval_set_id, + eval_set_id=inference_result.eval_set_id, + eval_id=inference_result.eval_case_id, + final_eval_status=EvalStatus.NOT_EVALUATED, + overall_eval_metric_results=[], + eval_metric_result_per_invocation=[], + session_id=inference_result.session_id, + session_details=None, + user_id='test_user_id', + ) + return (inference_result, eval_case_result) + eval_case = self._eval_sets_manager.get_eval_case( app_name=inference_result.app_name, eval_set_id=inference_result.eval_set_id, @@ -415,17 +433,19 @@ async def _perform_inference_single_eval_item( try: # Use App if available (so plugins like ReflectAndRetryToolPlugin run) if self._app is not None: - inferences = ( - await EvaluationGenerator._generate_inferences_from_app( - invocations=eval_case.conversation, - app=self._app, - initial_session=initial_session, - session_id=session_id, - session_service=self._session_service, - artifact_service=self._artifact_service, - memory_service=self._memory_service, + with client_label_context(EVAL_CLIENT_LABEL): # ← ADD THIS + inferences = ( + await EvaluationGenerator._generate_inferences_from_app( + app=self._app, + root_agent=root_agent, + user_simulator=self._user_simulator_provider.provide(eval_case), + initial_session=initial_session, + session_id=session_id, + session_service=self._session_service, + artifact_service=self._artifact_service, + memory_service=self._memory_service, + ) ) - ) else: # Fallback to direct root_agent usage (existing behavior) with client_label_context(EVAL_CLIENT_LABEL): @@ -441,6 +461,7 @@ 
async def _perform_inference_single_eval_item( ) ) + inference_result.inferences = inferences inference_result.status = InferenceStatus.SUCCESS From de93f9f0081b80f20ba9e96b4fecc1f6004807ca Mon Sep 17 00:00:00 2001 From: Ishan Raj Singh Date: Sat, 6 Dec 2025 00:03:53 +0530 Subject: [PATCH 04/13] feat: Support App plugins in evaluation framework - Add _generate_inferences_from_app to preserve App-level plugins during evals - Enables ReflectAndRetryToolPlugin and other plugins to run in evaluation mode - Add early exit for failed inferences to prevent crashes - Maintain backward compatibility with direct root_agent usage - Properly label eval requests with EVAL_CLIENT_LABEL for telemetry - Remove unused root_agent parameter from _generate_inferences_from_app - Deduplicate client_label_context wrapper Addresses Gemini Code Assist feedback for improved code clarity and maintainability. --- .../adk/evaluation/evaluation_generator.py | 1 - .../adk/evaluation/local_eval_service.py | 54 +++++++++---------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 1c7cd8c9bd..e1ab33ecb5 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -332,7 +332,6 @@ def convert_events_to_eval_invocations( @staticmethod async def _generate_inferences_from_app( app: 'App', - root_agent: 'Agent', user_simulator: 'UserSimulator', initial_session: Optional['SessionInput'], session_id: str, diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index 1198ccf145..355e8b6079 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -432,34 +432,32 @@ async def _perform_inference_single_eval_item( try: # Use App if available (so plugins like ReflectAndRetryToolPlugin run) - if self._app is not None: 
- with client_label_context(EVAL_CLIENT_LABEL): # ← ADD THIS - inferences = ( - await EvaluationGenerator._generate_inferences_from_app( - app=self._app, - root_agent=root_agent, - user_simulator=self._user_simulator_provider.provide(eval_case), - initial_session=initial_session, - session_id=session_id, - session_service=self._session_service, - artifact_service=self._artifact_service, - memory_service=self._memory_service, - ) - ) - else: - # Fallback to direct root_agent usage (existing behavior) - with client_label_context(EVAL_CLIENT_LABEL): - inferences = ( - await EvaluationGenerator._generate_inferences_from_root_agent( - root_agent=root_agent, - user_simulator=self._user_simulator_provider.provide(eval_case), - initial_session=initial_session, - session_id=session_id, - session_service=self._session_service, - artifact_service=self._artifact_service, - memory_service=self._memory_service, - ) - ) + with client_label_context(EVAL_CLIENT_LABEL): + if self._app is not None: + inferences = ( + await EvaluationGenerator._generate_inferences_from_app( + app=self._app, + user_simulator=self._user_simulator_provider.provide(eval_case), + initial_session=initial_session, + session_id=session_id, + session_service=self._session_service, + artifact_service=self._artifact_service, + memory_service=self._memory_service, + ) + ) + else: + # Fallback to direct root_agent usage (existing behavior) + inferences = ( + await EvaluationGenerator._generate_inferences_from_root_agent( + root_agent=root_agent, + user_simulator=self._user_simulator_provider.provide(eval_case), + initial_session=initial_session, + session_id=session_id, + session_service=self._session_service, + artifact_service=self._artifact_service, + memory_service=self._memory_service, + ) + ) inference_result.inferences = inferences From 884b8dfba0c724bd6e1f6b4f7a40301871e56952 Mon Sep 17 00:00:00 2001 From: ISHAN RAJ SINGH Date: Sat, 6 Dec 2025 00:07:07 +0530 Subject: [PATCH 05/13] Update 
src/google/adk/evaluation/evaluation_generator.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/google/adk/evaluation/evaluation_generator.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index e1ab33ecb5..7c150442d8 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -364,13 +364,16 @@ async def _generate_inferences_from_app( name="ensure_retry_options" ) - # Create Runner with App to preserve plugins + # Create a copy of the app to avoid mutating the original object and add eval-specific plugins. + app_for_runner = app.model_copy(deep=True) + app_for_runner.plugins.extend([request_intercepter_plugin, ensure_retry_options_plugin]) + + # Create Runner with the modified App to preserve plugins async with Runner( - app=app, + app=app_for_runner, session_service=session_service, artifact_service=artifact_service, memory_service=memory_service, - plugins=[request_intercepter_plugin, ensure_retry_options_plugin], ) as runner: events = [] From 1380758253dfa2926ec96d1a5fc8f3413e5c5db6 Mon Sep 17 00:00:00 2001 From: Ishan Raj Singh Date: Sat, 6 Dec 2025 00:12:49 +0530 Subject: [PATCH 06/13] feat: Support App plugins in evaluation framework - Add _generate_inferences_from_app to preserve App-level plugins during evals - Enables ReflectAndRetryToolPlugin and other plugins to run in evaluation mode - Add early exit for failed inferences to prevent crashes - Maintain backward compatibility with direct root_agent usage - Remove unused root_agent parameter from _generate_inferences_from_app - Deduplicate client_label_context wrapper - Refactor common arguments to reduce code duplication - Deep copy app before adding eval plugins to avoid mutation Addresses all Gemini Code Assist feedback plus additional safety 
improvements. --- .../adk/evaluation/local_eval_service.py | 38 +++++++++---------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index 355e8b6079..7e4b60ebbb 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -433,37 +433,33 @@ async def _perform_inference_single_eval_item( try: # Use App if available (so plugins like ReflectAndRetryToolPlugin run) with client_label_context(EVAL_CLIENT_LABEL): + # Extract common arguments to reduce duplication + common_args = { + "user_simulator": self._user_simulator_provider.provide(eval_case), + "initial_session": initial_session, + "session_id": session_id, + "session_service": self._session_service, + "artifact_service": self._artifact_service, + "memory_service": self._memory_service, + } + if self._app is not None: - inferences = ( - await EvaluationGenerator._generate_inferences_from_app( - app=self._app, - user_simulator=self._user_simulator_provider.provide(eval_case), - initial_session=initial_session, - session_id=session_id, - session_service=self._session_service, - artifact_service=self._artifact_service, - memory_service=self._memory_service, - ) + inferences = await EvaluationGenerator._generate_inferences_from_app( + app=self._app, + **common_args ) else: # Fallback to direct root_agent usage (existing behavior) - inferences = ( - await EvaluationGenerator._generate_inferences_from_root_agent( - root_agent=root_agent, - user_simulator=self._user_simulator_provider.provide(eval_case), - initial_session=initial_session, - session_id=session_id, - session_service=self._session_service, - artifact_service=self._artifact_service, - memory_service=self._memory_service, - ) + inferences = await EvaluationGenerator._generate_inferences_from_root_agent( + root_agent=root_agent, + **common_args ) - inference_result.inferences = inferences 
inference_result.status = InferenceStatus.SUCCESS return inference_result + except Exception as e: # We intentionally catch the Exception as we don't want failures to affect # other inferences. From 9bd14330d9816cb318404bd63419d8792e3cab70 Mon Sep 17 00:00:00 2001 From: Ishan Raj Singh Date: Sat, 6 Dec 2025 00:19:50 +0530 Subject: [PATCH 07/13] feat: Support App plugins in evaluation framework - Add _generate_inferences_from_app to preserve App-level plugins during evals - Enables ReflectAndRetryToolPlugin and other plugins to run in evaluation mode - Add early exit for failed inferences to prevent crashes - Maintain backward compatibility with direct root_agent usage - Remove unused root_agent parameter from _generate_inferences_from_app - Deduplicate client_label_context wrapper - Refactor common arguments to reduce code duplication - Deep copy app before adding eval plugins to avoid mutation Addresses all Gemini Code Assist feedback plus additional safety improvements. --- .../adk/evaluation/evaluation_generator.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 7c150442d8..fc2a896809 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -342,19 +342,16 @@ async def _generate_inferences_from_app( """Generate inferences by invoking through App (preserving plugins).""" # Determine user_id consistently - user_id = 'test_user_id' - if initial_session and initial_session.user_id is not None: - user_id = initial_session.user_id + user_id = initial_session.user_id if initial_session else 'test_user_id' - # Initialize session if provided - if initial_session: - app_name = initial_session.app_name if initial_session.app_name else app.name - await session_service.create_session( - app_name=app_name, - user_id=user_id, - session_id=session_id, - 
state=initial_session.state if initial_session.state else {}, - ) + # Initialize session (always, not just when initial_session provided) + app_name = initial_session.app_name if initial_session else app.name + await session_service.create_session( + app_name=app_name, + user_id=user_id, + session_id=session_id, + state=initial_session.state if initial_session else {}, + ) # Create plugins to track requests (needed for app_details) request_intercepter_plugin = _RequestIntercepterPlugin( @@ -403,6 +400,7 @@ async def _generate_inferences_from_app( ) + @staticmethod def _get_app_details_by_invocation_id( events: list[Event], request_intercepter: _RequestIntercepterPlugin From 379c9bdf5461ba2cf649ffc3c9eac33f2efd9234 Mon Sep 17 00:00:00 2001 From: Ishan Raj Singh Date: Sat, 6 Dec 2025 00:31:52 +0530 Subject: [PATCH 08/13] refactor: Address Gemini Code Assist feedback - Extract common simulation logic into _run_user_simulation_loop helper - Eliminates ~30 lines of code duplication - Single source of truth for user simulation workflow - Used by both _generate_inferences_from_app and _generate_inferences_from_root_agent - Move imports to top of _load_app_from_module for PEP 8 compliance - Improves readability and makes dependencies clear upfront - Fix critical bug: Always create session in _generate_inferences_from_app - Previously failed when EvalCase had no session_input - Now consistent with _generate_inferences_from_root_agent - Simplify user_id determination logic with ternary operators Changes reduce code by ~30 lines while improving maintainability. 
--- src/google/adk/cli/cli_tools_click.py | 6 +- .../adk/evaluation/evaluation_generator.py | 185 ++++++++++-------- 2 files changed, 104 insertions(+), 87 deletions(-) diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py index c9227908f5..707cc01c94 100644 --- a/src/google/adk/cli/cli_tools_click.py +++ b/src/google/adk/cli/cli_tools_click.py @@ -527,13 +527,14 @@ def _load_app_from_module(module_path: str) -> Optional['App']: Returns: App instance if found, None otherwise """ + import importlib + from ..apps.app import App + try: - import importlib module = importlib.import_module(module_path) # Check for 'app' attribute (most common convention) if hasattr(module, 'app'): - from ..apps.app import App candidate = getattr(module, 'app') if isinstance(candidate, App): logger.info(f"Loaded App instance from {module_path}") @@ -547,6 +548,7 @@ def _load_app_from_module(module_path: str) -> Optional['App']: return None + def eval_options(): """Decorator to add common eval options to click commands.""" diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index fc2a896809..104b972ea4 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -158,6 +158,54 @@ async def _process_query( reset_func=reset_func, initial_session=initial_session, ) + + @staticmethod + async def _run_user_simulation_loop( + runner: Runner, + user_id: str, + session_id: str, + user_simulator: UserSimulator, + request_intercepter_plugin: _RequestIntercepterPlugin, + ) -> list[Invocation]: + """Run the user simulation loop and return invocations. 
+ + Args: + runner: Configured Runner instance + user_id: User identifier + session_id: Session identifier + user_simulator: User simulator to generate messages + request_intercepter_plugin: Plugin to intercept requests for app_details + + Returns: + List of Invocation objects from the simulation + """ + events = [] + + # Loop through user simulator messages (handles both static and dynamic) + while True: + next_user_message = await user_simulator.get_next_user_message( + copy.deepcopy(events) + ) + if next_user_message.status == UserSimulatorStatus.SUCCESS: + async for event in EvaluationGenerator._generate_inferences_for_single_user_invocation( + runner, user_id, session_id, next_user_message.user_message + ): + events.append(event) + else: # no more messages + break + + # Extract app details from intercepted requests + app_details_by_invocation_id = ( + EvaluationGenerator._get_app_details_by_invocation_id( + events, request_intercepter_plugin + ) + ) + + # Convert events to invocations + return EvaluationGenerator.convert_events_to_eval_invocations( + events, app_details_by_invocation_id + ) + @staticmethod async def _generate_inferences_for_single_user_invocation( @@ -198,74 +246,59 @@ async def _generate_inferences_from_root_agent( artifact_service: Optional[BaseArtifactService] = None, memory_service: Optional[BaseMemoryService] = None, ) -> list[Invocation]: - """Scrapes the root agent in coordination with the user simulator.""" + """Scrapes the root agent in coordination with the user simulator.""" - if not session_service: - session_service = InMemorySessionService() + if not session_service: + session_service = InMemorySessionService() - if not memory_service: - memory_service = InMemoryMemoryService() + if not memory_service: + memory_service = InMemoryMemoryService() - app_name = ( - initial_session.app_name if initial_session else "EvaluationGenerator" - ) - user_id = initial_session.user_id if initial_session else "test_user_id" - session_id = 
session_id if session_id else str(uuid.uuid4()) - - _ = await session_service.create_session( - app_name=app_name, - user_id=user_id, - state=initial_session.state if initial_session else {}, - session_id=session_id, - ) + app_name = ( + initial_session.app_name if initial_session else "EvaluationGenerator" + ) + user_id = initial_session.user_id if initial_session else "test_user_id" + session_id = session_id if session_id else str(uuid.uuid4()) - if not artifact_service: - artifact_service = InMemoryArtifactService() + _ = await session_service.create_session( + app_name=app_name, + user_id=user_id, + state=initial_session.state if initial_session else {}, + session_id=session_id, + ) - # Reset agent state for each query - if callable(reset_func): - reset_func() + if not artifact_service: + artifact_service = InMemoryArtifactService() - request_intercepter_plugin = _RequestIntercepterPlugin( - name="request_intercepter_plugin" - ) - # We ensure that there is some kind of retries on the llm_requests that are - # generated from the Agent. This is done to make inferencing step of evals - # more resilient to temporary model failures. 
- ensure_retry_options_plugin = EnsureRetryOptionsPlugin( - name="ensure_retry_options" - ) - async with Runner( - app_name=app_name, - agent=root_agent, - artifact_service=artifact_service, - session_service=session_service, - memory_service=memory_service, - plugins=[request_intercepter_plugin, ensure_retry_options_plugin], - ) as runner: - events = [] - while True: - next_user_message = await user_simulator.get_next_user_message( - copy.deepcopy(events) - ) - if next_user_message.status == UserSimulatorStatus.SUCCESS: - async for ( - event - ) in EvaluationGenerator._generate_inferences_for_single_user_invocation( - runner, user_id, session_id, next_user_message.user_message - ): - events.append(event) - else: # no message generated - break + # Reset agent state for each query + if callable(reset_func): + reset_func() - app_details_by_invocation_id = ( - EvaluationGenerator._get_app_details_by_invocation_id( - events, request_intercepter_plugin - ) + request_intercepter_plugin = _RequestIntercepterPlugin( + name="request_intercepter_plugin" ) - return EvaluationGenerator.convert_events_to_eval_invocations( - events, app_details_by_invocation_id + # We ensure that there is some kind of retries on the llm_requests that are + # generated from the Agent. This is done to make inferencing step of evals + # more resilient to temporary model failures. 
+ ensure_retry_options_plugin = EnsureRetryOptionsPlugin( + name="ensure_retry_options" ) + async with Runner( + app_name=app_name, + agent=root_agent, + artifact_service=artifact_service, + session_service=session_service, + memory_service=memory_service, + plugins=[request_intercepter_plugin, ensure_retry_options_plugin], + ) as runner: + return await EvaluationGenerator._run_user_simulation_loop( + runner=runner, + user_id=user_id, + session_id=session_id, + user_simulator=user_simulator, + request_intercepter_plugin=request_intercepter_plugin, + ) + @staticmethod def convert_events_to_eval_invocations( @@ -344,7 +377,7 @@ async def _generate_inferences_from_app( # Determine user_id consistently user_id = initial_session.user_id if initial_session else 'test_user_id' - # Initialize session (always, not just when initial_session provided) + # Initialize session app_name = initial_session.app_name if initial_session else app.name await session_service.create_session( app_name=app_name, @@ -372,35 +405,17 @@ async def _generate_inferences_from_app( artifact_service=artifact_service, memory_service=memory_service, ) as runner: - events = [] - - # Loop through user simulator messages (handles both static and dynamic) - while True: - next_user_message = await user_simulator.get_next_user_message( - copy.deepcopy(events) - ) - if next_user_message.status == UserSimulatorStatus.SUCCESS: - async for event in EvaluationGenerator._generate_inferences_for_single_user_invocation( - runner, user_id, session_id, next_user_message.user_message - ): - events.append(event) - else: # no more messages - break - - # Extract app details from intercepted requests - app_details_by_invocation_id = ( - EvaluationGenerator._get_app_details_by_invocation_id( - events, request_intercepter_plugin - ) - ) - - # Convert events to invocations - return EvaluationGenerator.convert_events_to_eval_invocations( - events, app_details_by_invocation_id + return await 
EvaluationGenerator._run_user_simulation_loop( + runner=runner, + user_id=user_id, + session_id=session_id, + user_simulator=user_simulator, + request_intercepter_plugin=request_intercepter_plugin, ) + @staticmethod def _get_app_details_by_invocation_id( events: list[Event], request_intercepter: _RequestIntercepterPlugin From 3c1c7a4ed72326353db1cb6f525ea9f161eaa9ce Mon Sep 17 00:00:00 2001 From: ISHAN RAJ SINGH Date: Sat, 6 Dec 2025 00:36:17 +0530 Subject: [PATCH 09/13] Update src/google/adk/evaluation/evaluation_generator.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/google/adk/evaluation/evaluation_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 104b972ea4..4d46991355 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -184,7 +184,7 @@ async def _run_user_simulation_loop( # Loop through user simulator messages (handles both static and dynamic) while True: next_user_message = await user_simulator.get_next_user_message( - copy.deepcopy(events) + events ) if next_user_message.status == UserSimulatorStatus.SUCCESS: async for event in EvaluationGenerator._generate_inferences_for_single_user_invocation( From dae85d35c597f9825cc7e4db4b81425c47afe8e4 Mon Sep 17 00:00:00 2001 From: ISHAN RAJ SINGH Date: Sat, 6 Dec 2025 00:37:02 +0530 Subject: [PATCH 10/13] Update src/google/adk/evaluation/evaluation_generator.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> From d03e414825652d8db26af2c4406caf411bed9487 Mon Sep 17 00:00:00 2001 From: Ishan Raj Singh Date: Sat, 6 Dec 2025 00:42:31 +0530 Subject: [PATCH 11/13] fix: Avoid duplicate eval plugins and report the eval case's user_id on failed inference
- In _generate_inferences_from_app, add the request intercepter and retry-options plugins to the copied App only when no plugin with the same name is already registered - In _evaluate_single_inference_result, fetch the eval case for failed or missing inferences and use its session_input.user_id in the EvalCaseResult, falling back to 'test_user_id' --- src/google/adk/evaluation/evaluation_generator.py | 7 ++++++- src/google/adk/evaluation/local_eval_service.py | 13 ++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 4d46991355..bd152ef872 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -396,7 +396,12 @@ async def _generate_inferences_from_app( # Create a copy of the app to avoid mutating the original object and add eval-specific plugins. app_for_runner = app.model_copy(deep=True) - app_for_runner.plugins.extend([request_intercepter_plugin, ensure_retry_options_plugin]) + # Add eval-specific plugins, ensuring no duplicates.
+ existing_plugin_names = {p.name for p in app_for_runner.plugins} + if request_intercepter_plugin.name not in existing_plugin_names: + app_for_runner.plugins.append(request_intercepter_plugin) + if ensure_retry_options_plugin.name not in existing_plugin_names: + app_for_runner.plugins.append(ensure_retry_options_plugin) # Create Runner with the modified App to preserve plugins async with Runner( diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index 7e4b60ebbb..b3a78e0f24 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -208,6 +208,17 @@ async def _evaluate_single_inference_result( inference_result.status == InferenceStatus.FAILURE or inference_result.inferences is None ): + # We still need to fetch eval_case to get the correct user_id. + eval_case = self._eval_sets_manager.get_eval_case( + app_name=inference_result.app_name, + eval_set_id=inference_result.eval_set_id, + eval_case_id=inference_result.eval_case_id, + ) + user_id = ( + eval_case.session_input.user_id + if eval_case and eval_case.session_input and eval_case.session_input.user_id + else 'test_user_id' + ) eval_case_result = EvalCaseResult( eval_set_file=inference_result.eval_set_id, eval_set_id=inference_result.eval_set_id, @@ -217,7 +228,7 @@ async def _evaluate_single_inference_result( eval_metric_result_per_invocation=[], session_id=inference_result.session_id, session_details=None, - user_id='test_user_id', + user_id=user_id, ) return (inference_result, eval_case_result) From c1babc65930bb8f4c8251eb4ad0fac73efe205c1 Mon Sep 17 00:00:00 2001 From: ISHAN RAJ SINGH Date: Sat, 6 Dec 2025 00:45:12 +0530 Subject: [PATCH 12/13] Update src/google/adk/evaluation/evaluation_generator.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/google/adk/evaluation/evaluation_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index bd152ef872..815c6d6880 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -184,7 +184,7 @@ async def _run_user_simulation_loop( # Loop through user simulator messages (handles both static and dynamic) while True: next_user_message = await user_simulator.get_next_user_message( - events + copy.deepcopy(events) ) if next_user_message.status == UserSimulatorStatus.SUCCESS: async for event in EvaluationGenerator._generate_inferences_for_single_user_invocation( From 7909beed44ecf6e88f048ed548c40b75ef9567fd Mon Sep 17 00:00:00 2001 From: ISHAN RAJ SINGH Date: Sat, 6 Dec 2025 11:22:53 +0530 Subject: [PATCH 13/13] Update src/google/adk/cli/cli_tools_click.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/google/adk/cli/cli_tools_click.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py index 707cc01c94..480f2a4ee5 100644 --- a/src/google/adk/cli/cli_tools_click.py +++ b/src/google/adk/cli/cli_tools_click.py @@ -533,11 +533,10 @@ def _load_app_from_module(module_path: str) -> Optional['App']: try: module = importlib.import_module(module_path) - # Check for 'app' attribute (most common convention) - if hasattr(module, 'app'): - candidate = getattr(module, 'app') + # Find the first attribute that is an instance of App + for name, candidate in inspect.getmembers(module): if isinstance(candidate, App): - logger.info(f"Loaded App instance from {module_path}") + logger.info(f"Loaded App instance '{name}' from {module_path}") return candidate logger.debug(f"No App instance found in {module_path}")