google · ftnext · Feb 1, 2026 · Feb 1, 2026 · Feb 1, 2026 · Feb 1, 2026
diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py
@@ -34,11 +34,9 @@
 from ..evaluation.eval_case import get_all_tool_calls
 from ..evaluation.eval_case import IntermediateDataType
 from ..evaluation.eval_metrics import EvalMetric
-from ..evaluation.eval_metrics import Interval
-from ..evaluation.eval_metrics import MetricInfo
-from ..evaluation.eval_metrics import MetricValueInfo
 from ..evaluation.eval_result import EvalCaseResult
 from ..evaluation.eval_sets_manager import EvalSetsManager
+from ..evaluation.metric_defaults import get_default_metric_info
 from ..utils.context_utils import Aclosing
 
 logger = logging.getLogger("google_adk." + __name__)
@@ -73,19 +71,6 @@ def _get_agent_module(agent_module_file_path: str):
   return _import_from_path(module_name, file_path)
 
 
-def get_default_metric_info(
-    metric_name: str, description: str = ""
-) -> MetricInfo:
-  """Returns a default MetricInfo for a metric."""
-  return MetricInfo(
-      metric_name=metric_name,
-      description=description,
-      metric_value_info=MetricValueInfo(
-          interval=Interval(min_value=0.0, max_value=1.0)
-      ),
-  )
-
-
 def get_root_agent(agent_module_file_path: str) -> Agent:
   """Returns root agent given the agent module."""
   agent_module = _get_agent_module(agent_module_file_path)

diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
@@ -50,6 +50,9 @@
 from .evaluator import EvalStatus
 from .in_memory_eval_sets_manager import InMemoryEvalSetsManager
 from .local_eval_sets_manager import convert_eval_set_to_pydantic_schema
+from .metric_defaults import get_default_metric_info
+from .metric_evaluator_registry import _get_default_metric_evaluator_registry
+from .metric_evaluator_registry import MetricEvaluatorRegistry
-from .metric_defaults import get_default_metric_info
-from .metric_evaluator_registry import _get_default_metric_evaluator_registry
-from .metric_evaluator_registry import MetricEvaluatorRegistry
+from .custom_metric_evaluator import _CustomMetricEvaluator
+from .metric_defaults import get_default_metric_info
+from .metric_evaluator_registry import _get_default_metric_evaluator_registry
+from .metric_evaluator_registry import MetricEvaluatorRegistry
-from .metric_defaults import get_default_metric_info
-from .metric_evaluator_registry import _get_default_metric_evaluator_registry
-from .metric_evaluator_registry import MetricEvaluatorRegistry
+from .custom_metric_evaluator import _CustomMetricEvaluator
+from .metric_defaults import get_default_metric_info
+from .metric_evaluator_registry import _get_default_metric_evaluator_registry
+from .metric_evaluator_registry import MetricEvaluatorRegistry
 from .simulation.user_simulator_provider import UserSimulatorProvider
 
 logger = logging.getLogger("google_adk." + __name__)
@@ -154,13 +157,30 @@ async def evaluate_eval_set(
         user_simulator_config=eval_config.user_simulator_config
     )
 
+    metric_evaluator_registry = _get_default_metric_evaluator_registry()
+    if eval_config.custom_metrics:
+      from .custom_metric_evaluator import _CustomMetricEvaluator
+
+      for metric_name, config in eval_config.custom_metrics.items():
+        if config.metric_info:
+          metric_info = config.metric_info.model_copy()
+          metric_info.metric_name = metric_name
+        else:
+          metric_info = get_default_metric_info(
+              metric_name=metric_name, description=config.description
+          )
+        metric_evaluator_registry.register_evaluator(
+            metric_info, _CustomMetricEvaluator
+        )
+
     # Step 1: Perform evals, basically inferencing and evaluation of metrics
     eval_results_by_eval_id = await AgentEvaluator._get_eval_results_by_eval_id(
         agent_for_eval=agent_for_eval,
         eval_set=eval_set,
         eval_metrics=eval_metrics,
         num_runs=num_runs,
         user_simulator_provider=user_simulator_provider,
+        metric_evaluator_registry=metric_evaluator_registry,
     )
 
     # Step 2: Post-process the results!
@@ -536,6 +556,7 @@ async def _get_eval_results_by_eval_id(
       eval_metrics: list[EvalMetric],
       num_runs: int,
       user_simulator_provider: UserSimulatorProvider,
+      metric_evaluator_registry: Optional[MetricEvaluatorRegistry] = None,
   ) -> dict[str, list[EvalCaseResult]]:
     """Returns EvalCaseResults grouped by eval case id.
 
@@ -560,6 +581,7 @@ async def _get_eval_results_by_eval_id(
             app_name=app_name, eval_set=eval_set
         ),
         user_simulator_provider=user_simulator_provider,
+        metric_evaluator_registry=metric_evaluator_registry,
     )
 
     inference_requests = [

diff --git a/src/google/adk/evaluation/metric_defaults.py b/src/google/adk/evaluation/metric_defaults.py
@@ -0,0 +1,32 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from .eval_metrics import Interval
+from .eval_metrics import MetricInfo
+from .eval_metrics import MetricValueInfo
+
+
+def get_default_metric_info(
+    metric_name: str, description: str = ""
+) -> MetricInfo:
+  """Returns a default MetricInfo for a metric."""
+  return MetricInfo(
+      metric_name=metric_name,
+      description=description,
+      metric_value_info=MetricValueInfo(
+          interval=Interval(min_value=0.0, max_value=1.0)
+      ),
+  )
diff --git a/src/google/adk/evaluation/metric_evaluator_registry.py b/src/google/adk/evaluation/metric_evaluator_registry.py
@@ -47,7 +47,9 @@
 class MetricEvaluatorRegistry:
   """A registry for metric Evaluators."""
 
-  _registry: dict[str, tuple[type[Evaluator], MetricInfo]] = {}
+  def __init__(self):
+    """Initializes an empty registry."""
+    self._registry: dict[str, tuple[type[Evaluator], MetricInfo]] = {}
 
   def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator:
     """Returns an Evaluator for the given metric.

diff --git a/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/metrics.py b/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/metrics.py
@@ -0,0 +1,69 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from google.adk.evaluation.eval_case import ConversationScenario
+from google.adk.evaluation.eval_case import get_all_tool_calls
+from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.eval_metrics import EvalMetric
+from google.adk.evaluation.eval_metrics import EvalStatus
+from google.adk.evaluation.evaluator import EvaluationResult
+from google.adk.evaluation.evaluator import PerInvocationResult
+
+
+def tool_trajectory_length_match(
+    eval_metric: EvalMetric,
+    actual_invocations: list[Invocation],
+    expected_invocations: Optional[list[Invocation]] = None,
+    conversation_scenario: Optional[ConversationScenario] = None,
+) -> EvaluationResult:
+  del eval_metric
+  del conversation_scenario
+  expected_invocations = expected_invocations or []
+
+  per_invocation_results = []
+  for idx, actual in enumerate(actual_invocations):
+    expected = (
+        expected_invocations[idx] if idx < len(expected_invocations) else None
+    )
+    actual_tools = get_all_tool_calls(actual.intermediate_data)
+    expected_tools = (
+        get_all_tool_calls(expected.intermediate_data) if expected else []
+    )
+    match = len(actual_tools) == len(expected_tools)
+    per_invocation_results.append(
+        PerInvocationResult(
+            actual_invocation=actual,
+            expected_invocation=expected,
+            score=1.0 if match else 0.0,
+            eval_status=EvalStatus.PASSED if match else EvalStatus.FAILED,
+        )
+    )
+
+  overall_score = (
+      sum(r.score for r in per_invocation_results) / len(per_invocation_results)
+      if per_invocation_results
+      else 0.0
+  )
+  overall_eval_status = (
+      EvalStatus.PASSED if overall_score == 1.0 else EvalStatus.FAILED
+  )
+  return EvaluationResult(
+      overall_score=overall_score,
+      overall_eval_status=overall_eval_status,
+      per_invocation_results=per_invocation_results,
+  )
diff --git a/...on/fixture/home_automation_agent/test_files/custom_metrics/simple_custom_metric.test.json b/...on/fixture/home_automation_agent/test_files/custom_metrics/simple_custom_metric.test.json
@@ -0,0 +1,65 @@
+{
+  "eval_set_id": "custom_metrics_eval_set",
+  "name": "custom_metrics_eval_set",
+  "description": "Custom metric evaluation sample.",
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/custom_metrics/simple_custom_metric.test.json",
+      "conversation": [
+        {
+          "invocation_id": "a9e4f840-7f1e-4b69-b9c1-3b85c03a60a4",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device_2 status to off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_2",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747337309.2360144
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747337309.2360282
+    }
+  ],
+  "creation_timestamp": 1747337309.2360387
+}
diff --git a/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/test_config.json b/tests/integration/fixture/home_automation_agent/test_files/custom_metrics/test_config.json
@@ -0,0 +1,13 @@
+{
+  "criteria": {
+    "tool_trajectory_length_match": 1.0
+  },
+  "custom_metrics": {
+    "tool_trajectory_length_match": {
+      "code_config": {
+        "name": "tests.integration.fixture.home_automation_agent.test_files.custom_metrics.metrics.tool_trajectory_length_match"
+      },
+      "description": "Checks that actual and expected tool trajectories have the same length."
+    }
+  }
+}
diff --git a/tests/integration/test_with_test_file.py b/tests/integration/test_with_test_file.py
@@ -25,6 +25,18 @@ async def test_with_single_test_file():
   )
 
 
+@pytest.mark.asyncio
+async def test_with_custom_metric():
+  """Test eval with a custom metric."""
+  await AgentEvaluator.evaluate(
+      agent_module="tests.integration.fixture.home_automation_agent",
+      eval_dataset_file_path_or_dir=(
+          "tests/integration/fixture/home_automation_agent/test_files/custom_metrics/simple_custom_metric.test.json"
+      ),
+      num_runs=1,
+  )
+
+
 @pytest.mark.asyncio
 async def test_with_folder_of_test_files_long_running():
   """Test the agent's basic ability via a folder of session files."""