From 8d00362998e9f68e06b68efe9f4acc56efac20b1 Mon Sep 17 00:00:00 2001
From: ftnext
Date: Sun, 1 Feb 2026 10:59:29 +0900
Subject: [PATCH 1/4] feat(eval): Support custom metrics in AgentEvaluator

---
 src/google/adk/evaluation/agent_evaluator.py |  37 +++++
 .../evaluation/test_agent_evaluator.py       | 137 ++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100644 tests/unittests/evaluation/test_agent_evaluator.py

diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
index c0fc736340..ae6323502d 100644
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@@ -43,6 +43,9 @@
 from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalMetricResult
+from .eval_metrics import Interval
+from .eval_metrics import MetricInfo
+from .eval_metrics import MetricValueInfo
 from .eval_metrics import PrebuiltMetrics
 from .eval_result import EvalCaseResult
 from .eval_set import EvalSet
@@ -50,6 +53,8 @@
 from .evaluator import EvalStatus
 from .in_memory_eval_sets_manager import InMemoryEvalSetsManager
 from .local_eval_sets_manager import convert_eval_set_to_pydantic_schema
+from .metric_evaluator_registry import _get_default_metric_evaluator_registry
+from .metric_evaluator_registry import MetricEvaluatorRegistry
 from .simulation.user_simulator_provider import UserSimulatorProvider

 logger = logging.getLogger("google_adk." + __name__)
@@ -82,6 +87,19 @@ def load_json(file_path: str) -> Union[Dict, List]:
     return json.load(f)


+def _get_default_metric_info(
+    metric_name: str, description: str = ""
+) -> MetricInfo:
+  """Returns a default MetricInfo for a metric."""
+  return MetricInfo(
+      metric_name=metric_name,
+      description=description,
+      metric_value_info=MetricValueInfo(
+          interval=Interval(min_value=0.0, max_value=1.0)
+      ),
+  )
+
+
 class _EvalMetricResultWithInvocation(BaseModel):
   """EvalMetricResult along with both actual and expected invocation.

@@ -154,6 +172,22 @@ async def evaluate_eval_set(
         user_simulator_config=eval_config.user_simulator_config
     )

+    metric_evaluator_registry = _get_default_metric_evaluator_registry()
+    if eval_config.custom_metrics:
+      from .custom_metric_evaluator import _CustomMetricEvaluator
+
+      for metric_name, config in eval_config.custom_metrics.items():
+        if config.metric_info:
+          metric_info = config.metric_info.model_copy()
+          metric_info.metric_name = metric_name
+        else:
+          metric_info = _get_default_metric_info(
+              metric_name=metric_name, description=config.description
+          )
+        metric_evaluator_registry.register_evaluator(
+            metric_info, _CustomMetricEvaluator
+        )
+
     # Step 1: Perform evals, basically inferencing and evaluation of metrics
     eval_results_by_eval_id = await AgentEvaluator._get_eval_results_by_eval_id(
         agent_for_eval=agent_for_eval,
@@ -161,6 +195,7 @@ async def evaluate_eval_set(
         eval_metrics=eval_metrics,
         num_runs=num_runs,
         user_simulator_provider=user_simulator_provider,
+        metric_evaluator_registry=metric_evaluator_registry,
     )

     # Step 2: Post-process the results!
@@ -536,6 +571,7 @@ async def _get_eval_results_by_eval_id(
       eval_metrics: list[EvalMetric],
       num_runs: int,
       user_simulator_provider: UserSimulatorProvider,
+      metric_evaluator_registry: Optional[MetricEvaluatorRegistry] = None,
   ) -> dict[str, list[EvalCaseResult]]:
     """Returns EvalCaseResults grouped by eval case id.
@@ -560,6 +596,7 @@ async def _get_eval_results_by_eval_id(
             app_name=app_name, eval_set=eval_set
         ),
         user_simulator_provider=user_simulator_provider,
+        metric_evaluator_registry=metric_evaluator_registry,
     )

     inference_requests = [
diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py
new file mode 100644
index 0000000000..7bfb99469d
--- /dev/null
+++ b/tests/unittests/evaluation/test_agent_evaluator.py
@@ -0,0 +1,137 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from google.adk.errors.not_found_error import NotFoundError
+from google.adk.evaluation.agent_evaluator import AgentEvaluator
+from google.adk.evaluation.custom_metric_evaluator import _CustomMetricEvaluator
+from google.adk.evaluation.eval_config import EvalConfig
+from google.adk.evaluation.eval_metrics import BaseCriterion
+from google.adk.evaluation.eval_metrics import EvalMetric
+from google.adk.evaluation.eval_set import EvalSet
+from google.adk.evaluation.metric_evaluator_registry import MetricEvaluatorRegistry
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def restore_metric_registry():
+  original_registry = MetricEvaluatorRegistry._registry.copy()
+  yield
+  MetricEvaluatorRegistry._registry = original_registry
+
+
+def fake_custom_metric(*_args, **_kwargs):
+  return None
+
+
+@pytest.mark.asyncio
+async def test_evaluate_eval_set_registers_custom_metric(monkeypatch):
+  eval_config = EvalConfig(
+      criteria={"my_custom_metric": 0.5},
+      custom_metrics={
+          "my_custom_metric": {
+              "code_config": {
+                  "name": (
+                      "tests.unittests.evaluation."
+                      "test_agent_evaluator.fake_custom_metric"
+                  ),
+              },
+          },
+      },
+  )
+  eval_set = EvalSet(
+      eval_set_id="eval_set",
+      name="eval_set",
+      eval_cases=[],
+  )
+
+  async def fake_get_agent_for_eval(*_args, **_kwargs):
+    return object()
+
+  async def fake_get_eval_results_by_eval_id(
+      *_args, metric_evaluator_registry, **_kwargs
+  ):
+    eval_metric = EvalMetric(
+        metric_name="my_custom_metric",
+        threshold=0.5,
+        criterion=BaseCriterion(threshold=0.5),
+        custom_function_path=(
+            "tests.unittests.evaluation.test_agent_evaluator.fake_custom_metric"
+        ),
+    )
+    evaluator = metric_evaluator_registry.get_evaluator(eval_metric)
+    assert isinstance(evaluator, _CustomMetricEvaluator)
+    return {}
+
+  monkeypatch.setattr(
+      AgentEvaluator, "_get_agent_for_eval", fake_get_agent_for_eval
+  )
+  monkeypatch.setattr(
+      AgentEvaluator,
+      "_get_eval_results_by_eval_id",
+      fake_get_eval_results_by_eval_id,
+  )
+
+  await AgentEvaluator.evaluate_eval_set(
+      agent_module="dummy.module",
+      eval_set=eval_set,
+      eval_config=eval_config,
+      num_runs=1,
+      print_detailed_results=False,
+  )
+
+
+@pytest.mark.asyncio
+async def test_evaluate_eval_set_does_not_register_without_custom_metrics(
+    monkeypatch,
+):
+  eval_config = EvalConfig(criteria={"response_match_score": 0.8})
+  eval_set = EvalSet(
+      eval_set_id="eval_set",
+      name="eval_set",
+      eval_cases=[],
+  )
+
+  async def fake_get_agent_for_eval(*_args, **_kwargs):
+    return object()
+
+  async def fake_get_eval_results_by_eval_id(
+      *_args, metric_evaluator_registry, **_kwargs
+  ):
+    eval_metric = EvalMetric(
+        metric_name="my_custom_metric",
+        threshold=0.5,
+        criterion=BaseCriterion(threshold=0.5),
+    )
+    with pytest.raises(NotFoundError):
+      metric_evaluator_registry.get_evaluator(eval_metric)
+    return {}
+
+  monkeypatch.setattr(
+      AgentEvaluator, "_get_agent_for_eval", fake_get_agent_for_eval
+  )
+  monkeypatch.setattr(
+      AgentEvaluator,
+      "_get_eval_results_by_eval_id",
+      fake_get_eval_results_by_eval_id,
+  )
+
+  await AgentEvaluator.evaluate_eval_set(
+      agent_module="dummy.module",
+      eval_set=eval_set,
+      eval_config=eval_config,
+      num_runs=1,
+      print_detailed_results=False,
+  )

From 944633288c93ca26b0b79c179072904b904fbca8 Mon Sep 17 00:00:00 2001
From: ftnext
Date: Sun, 1 Feb 2026 11:17:31 +0900
Subject: [PATCH 2/4] refactor(eval): Extract default metric info helper

---
 src/google/adk/cli/cli_eval.py               | 17 +---------
 src/google/adk/evaluation/agent_evaluator.py | 19 ++---------
 src/google/adk/evaluation/metric_defaults.py | 32 +++++++++++++++++++
 .../evaluation/test_metric_info_utils.py     | 26 +++++++++++++++
 4 files changed, 61 insertions(+), 33 deletions(-)
 create mode 100644 src/google/adk/evaluation/metric_defaults.py
 create mode 100644 tests/unittests/evaluation/test_metric_info_utils.py

diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py
index 33c1693208..571d95c5fb 100644
--- a/src/google/adk/cli/cli_eval.py
+++ b/src/google/adk/cli/cli_eval.py
@@ -34,11 +34,9 @@
 from ..evaluation.eval_case import get_all_tool_calls
 from ..evaluation.eval_case import IntermediateDataType
 from ..evaluation.eval_metrics import EvalMetric
-from ..evaluation.eval_metrics import Interval
-from ..evaluation.eval_metrics import MetricInfo
-from ..evaluation.eval_metrics import MetricValueInfo
 from ..evaluation.eval_result import EvalCaseResult
 from ..evaluation.eval_sets_manager import EvalSetsManager
+from ..evaluation.metric_defaults import get_default_metric_info
 from ..utils.context_utils import Aclosing

 logger = logging.getLogger("google_adk." + __name__)
@@ -73,19 +71,6 @@ def _get_agent_module(agent_module_file_path: str):
     return _import_from_path(module_name, file_path)


-def get_default_metric_info(
-    metric_name: str, description: str = ""
-) -> MetricInfo:
-  """Returns a default MetricInfo for a metric."""
-  return MetricInfo(
-      metric_name=metric_name,
-      description=description,
-      metric_value_info=MetricValueInfo(
-          interval=Interval(min_value=0.0, max_value=1.0)
-      ),
-  )
-
-
 def get_root_agent(agent_module_file_path: str) -> Agent:
   """Returns root agent given the agent module."""
   agent_module = _get_agent_module(agent_module_file_path)
diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
index ae6323502d..9cad502b5a 100644
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@@ -43,9 +43,6 @@
 from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalMetricResult
-from .eval_metrics import Interval
-from .eval_metrics import MetricInfo
-from .eval_metrics import MetricValueInfo
 from .eval_metrics import PrebuiltMetrics
 from .eval_result import EvalCaseResult
 from .eval_set import EvalSet
@@ -53,6 +50,7 @@
 from .evaluator import EvalStatus
 from .in_memory_eval_sets_manager import InMemoryEvalSetsManager
 from .local_eval_sets_manager import convert_eval_set_to_pydantic_schema
+from .metric_defaults import get_default_metric_info
 from .metric_evaluator_registry import _get_default_metric_evaluator_registry
 from .metric_evaluator_registry import MetricEvaluatorRegistry
 from .simulation.user_simulator_provider import UserSimulatorProvider
@@ -87,19 +85,6 @@ def load_json(file_path: str) -> Union[Dict, List]:
     return json.load(f)


-def _get_default_metric_info(
-    metric_name: str, description: str = ""
-) -> MetricInfo:
-  """Returns a default MetricInfo for a metric."""
-  return MetricInfo(
-      metric_name=metric_name,
-      description=description,
-      metric_value_info=MetricValueInfo(
-          interval=Interval(min_value=0.0, max_value=1.0)
-      ),
-  )
-
-
 class _EvalMetricResultWithInvocation(BaseModel):
   """EvalMetricResult along with both actual and expected invocation.

@@ -181,7 +166,7 @@ async def evaluate_eval_set(
           metric_info = config.metric_info.model_copy()
           metric_info.metric_name = metric_name
         else:
-          metric_info = _get_default_metric_info(
+          metric_info = get_default_metric_info(
              metric_name=metric_name, description=config.description
           )
         metric_evaluator_registry.register_evaluator(
diff --git a/src/google/adk/evaluation/metric_defaults.py b/src/google/adk/evaluation/metric_defaults.py
new file mode 100644
index 0000000000..acbaedca17
--- /dev/null
+++ b/src/google/adk/evaluation/metric_defaults.py
@@ -0,0 +1,32 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from .eval_metrics import Interval
+from .eval_metrics import MetricInfo
+from .eval_metrics import MetricValueInfo
+
+
+def get_default_metric_info(
+    metric_name: str, description: str = ""
+) -> MetricInfo:
+  """Returns a default MetricInfo for a metric."""
+  return MetricInfo(
+      metric_name=metric_name,
+      description=description,
+      metric_value_info=MetricValueInfo(
+          interval=Interval(min_value=0.0, max_value=1.0)
+      ),
+  )
diff --git a/tests/unittests/evaluation/test_metric_info_utils.py b/tests/unittests/evaluation/test_metric_info_utils.py
new file mode 100644
index 0000000000..b8d269d43b
--- /dev/null
+++ b/tests/unittests/evaluation/test_metric_info_utils.py
@@ -0,0 +1,26 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from google.adk.evaluation.metric_defaults import get_default_metric_info
+
+
+def test_get_default_metric_info():
+  metric_info = get_default_metric_info("my_metric", "test description")
+
+  assert metric_info.metric_name == "my_metric"
+  assert metric_info.description == "test description"
+  assert metric_info.metric_value_info.interval.min_value == 0.0
+  assert metric_info.metric_value_info.interval.max_value == 1.0

From 2985224393f426dac49a073d6016a7cdd5c2b8e7 Mon Sep 17 00:00:00 2001
From: ftnext
Date: Sun, 1 Feb 2026 12:14:07 +0900
Subject: [PATCH 3/4] test(integration): Add custom metric example eval

---
 .../test_files/custom_metrics/metrics.py   | 69 +++++++++++++++++++
 .../simple_custom_metric.test.json         | 65 +++++++++++++++++
 .../custom_metrics/test_config.json        | 13 ++++
 tests/integration/test_with_test_file.py   | 12 ++++
 4 files changed, 159 insertions(+)
 create mode 100644 tests/integration/fixture/home_automation_agent/test_files/custom_metrics/metrics.py
 create mode 100644 tests/integration/fixture/home_automation_agent/test_files/custom_metrics/simple_custom_metric.test.json
 create mode 100644 tests/integration/fixture/home_automation_agent/test_files/custom_metrics/test_config.json

diff --git a/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/metrics.py b/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/metrics.py
new file mode 100644
index 0000000000..2ecbf8bd5e
--- /dev/null
+++ b/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/metrics.py
@@ -0,0 +1,69 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from google.adk.evaluation.eval_case import ConversationScenario
+from google.adk.evaluation.eval_case import get_all_tool_calls
+from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.eval_metrics import EvalMetric
+from google.adk.evaluation.eval_metrics import EvalStatus
+from google.adk.evaluation.evaluator import EvaluationResult
+from google.adk.evaluation.evaluator import PerInvocationResult
+
+
+def tool_trajectory_length_match(
+    eval_metric: EvalMetric,
+    actual_invocations: list[Invocation],
+    expected_invocations: Optional[list[Invocation]] = None,
+    conversation_scenario: Optional[ConversationScenario] = None,
+) -> EvaluationResult:
+  del eval_metric
+  del conversation_scenario
+  expected_invocations = expected_invocations or []
+
+  per_invocation_results = []
+  for idx, actual in enumerate(actual_invocations):
+    expected = (
+        expected_invocations[idx] if idx < len(expected_invocations) else None
+    )
+    actual_tools = get_all_tool_calls(actual.intermediate_data)
+    expected_tools = (
+        get_all_tool_calls(expected.intermediate_data) if expected else []
+    )
+    match = len(actual_tools) == len(expected_tools)
+    per_invocation_results.append(
+        PerInvocationResult(
+            actual_invocation=actual,
+            expected_invocation=expected,
+            score=1.0 if match else 0.0,
+            eval_status=EvalStatus.PASSED if match else EvalStatus.FAILED,
+        )
+    )
+
+  overall_score = (
+      sum(r.score for r in per_invocation_results) / len(per_invocation_results)
+      if per_invocation_results
+      else 0.0
+  )
+  overall_eval_status = (
+      EvalStatus.PASSED if overall_score == 1.0 else EvalStatus.FAILED
+  )
+  return EvaluationResult(
+      overall_score=overall_score,
+      overall_eval_status=overall_eval_status,
+      per_invocation_results=per_invocation_results,
+  )
diff --git a/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/simple_custom_metric.test.json b/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/simple_custom_metric.test.json
new file mode 100644
index 0000000000..42a8d51470
--- /dev/null
+++ b/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/simple_custom_metric.test.json
@@ -0,0 +1,65 @@
+{
+  "eval_set_id": "custom_metrics_eval_set",
+  "name": "custom_metrics_eval_set",
+  "description": "Custom metric evaluation sample.",
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/custom_metrics/simple_custom_metric.test.json",
+      "conversation": [
+        {
+          "invocation_id": "a9e4f840-7f1e-4b69-b9c1-3b85c03a60a4",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device_2 status to off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_2",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747337309.2360144
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747337309.2360282
+    }
+  ],
+  "creation_timestamp": 1747337309.2360387
+}
diff --git a/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/test_config.json b/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/test_config.json
new file mode 100644
index 0000000000..405f7910f8
--- /dev/null
+++ b/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/test_config.json
@@ -0,0 +1,13 @@
+{
+  "criteria": {
+    "tool_trajectory_length_match": 1.0
+  },
+  "custom_metrics": {
+    "tool_trajectory_length_match": {
+      "code_config": {
+        "name": "tests.integration.fixture.home_automation_agent.test_files.custom_metrics.metrics.tool_trajectory_length_match"
+      },
+      "description": "Checks that actual and expected tool trajectories have the same length."
+    }
+  }
+}
diff --git a/tests/integration/test_with_test_file.py b/tests/integration/test_with_test_file.py
index eed2a2d732..aa644d0b71 100644
--- a/tests/integration/test_with_test_file.py
+++ b/tests/integration/test_with_test_file.py
@@ -25,6 +25,18 @@ async def test_with_single_test_file():
     )


+@pytest.mark.asyncio
+async def test_with_custom_metric():
+  """Test eval with a custom metric."""
+  await AgentEvaluator.evaluate(
+      agent_module="tests.integration.fixture.home_automation_agent",
+      eval_dataset_file_path_or_dir=(
+          "tests/integration/fixture/home_automation_agent/test_files/custom_metrics/simple_custom_metric.test.json"
+      ),
+      num_runs=1,
+  )
+
+
 @pytest.mark.asyncio
 async def test_with_folder_of_test_files_long_running():
   """Test the agent's basic ability via a folder of session files."""

From 2afcec12d7d69da7a42d29983ca03faa9478cafd Mon Sep 17 00:00:00 2001
From: ftnext
Date: Sun, 1 Feb 2026 12:24:25 +0900
Subject: [PATCH 4/4] fix(eval): Isolate metric evaluator registry per instance

---
 src/google/adk/evaluation/metric_evaluator_registry.py | 4 +++-
 tests/unittests/evaluation/test_agent_evaluator.py     | 8 --------
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/google/adk/evaluation/metric_evaluator_registry.py b/src/google/adk/evaluation/metric_evaluator_registry.py
index 775d5c2d7a..981e11926b 100644
--- a/src/google/adk/evaluation/metric_evaluator_registry.py
+++ b/src/google/adk/evaluation/metric_evaluator_registry.py
@@ -47,7 +47,9 @@
 class MetricEvaluatorRegistry:
   """A registry for metric Evaluators."""

-  _registry: dict[str, tuple[type[Evaluator], MetricInfo]] = {}
+  def __init__(self):
+    """Initializes an empty registry."""
+    self._registry: dict[str, tuple[type[Evaluator], MetricInfo]] = {}

   def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator:
     """Returns an Evaluator for the given metric.
diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py
index 7bfb99469d..fcd4e93b79 100644
--- a/tests/unittests/evaluation/test_agent_evaluator.py
+++ b/tests/unittests/evaluation/test_agent_evaluator.py
@@ -21,17 +21,9 @@
 from google.adk.evaluation.eval_metrics import BaseCriterion
 from google.adk.evaluation.eval_metrics import EvalMetric
 from google.adk.evaluation.eval_set import EvalSet
-from google.adk.evaluation.metric_evaluator_registry import MetricEvaluatorRegistry
 import pytest


-@pytest.fixture(autouse=True)
-def restore_metric_registry():
-  original_registry = MetricEvaluatorRegistry._registry.copy()
-  yield
-  MetricEvaluatorRegistry._registry = original_registry
-
-
 def fake_custom_metric(*_args, **_kwargs):
   return None