
Commit 3623478

vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add agent data to EvaluationRun show in Vertex AI GenAI SDK evals
PiperOrigin-RevId: 822275512
1 parent 67cf80b commit 3623478
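
Context for reviewers: the sketch below is not part of this commit; it shows roughly how the agent data added here surfaces when fetching an evaluation run, mirroring the calls exercised in the updated replay test. The vertexai.Client constructor, project, location, and run ID are assumptions/placeholders; get_evaluation_run, include_evaluation_items, inference_configs agent_config, and evaluation_item_results.agent_info come from the test itself.

# Hedged usage sketch, not part of this commit. Assumes a vertexai.Client
# configured for the GenAI SDK; identifiers below are placeholders.
from vertexai import Client

client = Client(project="my-project", location="us-central1")  # assumed entry point

run = client.evals.get_evaluation_run(
    name="projects/my-project/locations/us-central1/evaluationRuns/123",  # placeholder
    include_evaluation_items=True,
)

# Agent data now appears on the run's inference configs ...
for candidate, config in (run.inference_configs or {}).items():
    if config.agent_config is not None:
        print(candidate, config.agent_config.developer_instruction)

# ... and, aggregated, on the per-item results as AgentInfo.
agent_info = run.evaluation_item_results.agent_info
print(agent_info.name, agent_info.instruction, agent_info.tool_declarations)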

File tree

4 files changed: +163 -126 lines changed


tests/unit/vertexai/genai/replays/test_get_evaluation_run.py

Lines changed: 118 additions & 117 deletions
@@ -16,31 +16,40 @@
 
 from tests.unit.vertexai.genai.replays import pytest_helper
 from vertexai import types
+from google.genai import types as genai_types
 import datetime
 import pytest
 
 
 def test_get_eval_run(client):
     """Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
+    client._api_client._http_options.base_url = (
+        "https://us-central1-autopush-aiplatform.sandbox.googleapis.com/"
+    )
+    client._api_client._http_options.api_version = "v1beta1"
     evaluation_run_name = (
-        "projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808"
+        "projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
     )
     evaluation_run = client.evals.get_evaluation_run(
         name=evaluation_run_name, include_evaluation_items=True
     )
-    check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
-    check_run_1957799200510967808_evaluation_item_results(
+    check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
+    check_run_5133048044039700480_evaluation_item_results(
         client, evaluation_run, evaluation_run_name
     )
 
 
 def test_get_eval_run_include_evaluation_items_false(client):
     """Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
+    client._api_client._http_options.base_url = (
+        "https://us-central1-autopush-aiplatform.sandbox.googleapis.com/"
+    )
+    client._api_client._http_options.api_version = "v1beta1"
     evaluation_run_name = (
-        "projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808"
+        "projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
     )
     evaluation_run = client.evals.get_evaluation_run(name=evaluation_run_name)
-    check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
+    check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
     assert evaluation_run.evaluation_item_results is None
 
 
@@ -99,158 +108,150 @@ def test_get_eval_run_eval_set_source(client):
 @pytest.mark.asyncio
 async def test_get_eval_run_async(client):
     """Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
-    eval_run_id = "1957799200510967808"
+    client._api_client._http_options.base_url = (
+        "https://us-central1-autopush-aiplatform.sandbox.googleapis.com/"
+    )
+    client._api_client._http_options.api_version = "v1beta1"
+    eval_run_id = "5133048044039700480"
     evaluation_run_name = (
         f"projects/503583131166/locations/us-central1/evaluationRuns/{eval_run_id}"
     )
     evaluation_run = await client.aio.evals.get_evaluation_run(name=eval_run_id)
-    check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
+    check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
     assert evaluation_run.evaluation_item_results is None
 
 
-def check_run_1957799200510967808(
+def check_run_5133048044039700480(
     client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
 ):
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.name == evaluation_run_name
-    assert evaluation_run.display_name == "test2"
-    assert evaluation_run.metadata == {"pipeline_id": "4460531348888616960"}
+    assert evaluation_run.display_name == "sdk-test-1"
+    assert evaluation_run.metadata == {"pipeline_id": "4868043098678099968"}
     assert evaluation_run.create_time == datetime.datetime(
-        2025, 9, 8, 20, 55, 41, 833176, tzinfo=datetime.timezone.utc
+        2025, 10, 21, 19, 25, 58, 669441, tzinfo=datetime.timezone.utc
     )
     assert evaluation_run.completion_time == datetime.datetime(
-        2025, 9, 8, 20, 56, 13, 492971, tzinfo=datetime.timezone.utc
+        2025, 10, 21, 19, 26, 15, 855568, tzinfo=datetime.timezone.utc
     )
     assert evaluation_run.state == types.EvaluationRunState.SUCCEEDED
     assert evaluation_run.evaluation_set_snapshot == (
-        "projects/503583131166/locations/us-central1/evaluationSets/8069535738573619200"
-    )
-    assert evaluation_run.data_source.bigquery_request_set == types.BigQueryRequestSet(
-        uri="bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b",
-        prompt_column="request",
-        candidate_response_columns={
-            "baseline_model_response": "baseline_model_response",
-            "checkpoint_1": "checkpoint_1",
-            "checkpoint_2": "checkpoint_2",
-        },
+        "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
     )
+    assert evaluation_run.data_source.evaluation_set == "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
     assert evaluation_run.evaluation_run_results.evaluation_set == (
-        "projects/503583131166/locations/us-central1/evaluationSets/102386522778501120"
+        "projects/503583131166/locations/us-central1/evaluationSets/129513673658990592"
     )
     assert evaluation_run.inference_configs == {
-        "checkpoint_1": types.EvaluationRunInferenceConfig(
-            model="projects/503583131166/locations/us-central1/endpoints/9030177948249882624"
-        ),
-        "checkpoint_2": types.EvaluationRunInferenceConfig(
-            model="projects/503583131166/locations/us-central1/endpoints/7751155654076661760"
+        "gemini-2.0-flash-001@default": types.EvaluationRunInferenceConfig(
+            agent_config=types.EvaluationRunAgentConfig(
+                developer_instruction={
+                    "parts": [
+                        {
+                            "text": "example agent developer instruction"
+                        }
+                    ]
+                },
+                tools=[
+                    genai_types.Tool(
+                        function_declarations=[
+                            genai_types.FunctionDeclaration(
+                                name="check_chime",
+                                description="Check chime.",
+                                parameters={
+                                    "type": "OBJECT",
+                                    "properties": {
+                                        "nums": {
+                                            "type": "STRING",
+                                            "description": "List of numbers to be verified."
+                                        }
+                                    },
+                                    "required": [
+                                        "nums"
+                                    ]
+                                },
+                            ),
+                        ],
+                    )
+                ],
+            )
         ),
     }
     assert evaluation_run.evaluation_run_results.summary_metrics == (
         types.SummaryMetric(
             metrics={
-                "checkpoint_1/user_defined/MODE": 5,
-                "checkpoint_2/universal/P90": 1,
-                "gemini-2.0-flash-001@default/universal/AVERAGE": 0.6943817985685249,
-                "gemini-2.0-flash-001@default/user_defined/P90": 5,
-                "gemini-2.0-flash-001@default/universal/VARIANCE": 0.03146487552180889,
-                "gemini-2.0-flash-001@default/user_defined/P95": 5,
-                "checkpoint_1/universal/MINIMUM": 0.8571428656578064,
-                "checkpoint_1/universal/VARIANCE": 0.0015452162403157982,
-                "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.17738341388587855,
-                "checkpoint_2/user_defined/P95": 5,
-                "checkpoint_2/universal/MODE": 1,
-                "checkpoint_2/user_defined/P90": 5,
-                "checkpoint_2/universal/P99": 1,
-                "gemini-2.0-flash-001@default/universal/MAXIMUM": 1,
-                "checkpoint_2/universal/P95": 1,
-                "checkpoint_2/user_defined/P99": 5,
-                "checkpoint_2/universal/MINIMUM": 0.7777777910232544,
-                "gemini-2.0-flash-001@default/universal/P90": 0.8777777791023255,
-                "checkpoint_1/universal/AVERAGE": 0.986633250587865,
-                "checkpoint_1/universal/MAXIMUM": 1,
-                "checkpoint_1/universal/STANDARD_DEVIATION": 0.0393092386127714,
-                "gemini-2.0-flash-001@default/universal/P95": 0.9000000059604645,
-                "gemini-2.0-flash-001@default/user_defined/MAXIMUM": 5,
-                "gemini-2.0-flash-001@default/user_defined/MINIMUM": 3,
-                "gemini-2.0-flash-001@default/user_defined/VARIANCE": 0.4044321329639886,
-                "checkpoint_2/user_defined/MAXIMUM": 5,
-                "checkpoint_1/universal/MEDIAN": 1,
-                "gemini-2.0-flash-001@default/universal/MEDIAN": 0.7142857313156128,
-                "gemini-2.0-flash-001@default/user_defined/AVERAGE": 4.736842105263158,
-                "gemini-2.0-flash-001@default/user_defined/MEDIAN": 5,
-                "checkpoint_2/user_defined/AVERAGE": 5,
-                "checkpoint_2/user_defined/MEDIAN": 5,
-                "checkpoint_2/user_defined/STANDARD_DEVIATION": 0,
-                "checkpoint_2/universal/MAXIMUM": 1,
-                "checkpoint_1/universal/MODE": 1,
-                "checkpoint_2/user_defined/MINIMUM": 5,
-                "checkpoint_1/user_defined/VARIANCE": 0,
-                "checkpoint_2/universal/VARIANCE": 0.005771725970062436,
-                "checkpoint_2/universal/AVERAGE": 0.9438178790243048,
-                "checkpoint_1/user_defined/MINIMUM": 5,
-                "gemini-2.0-flash-001@default/universal/P99": 0.9800000011920929,
-                "gemini-2.0-flash-001@default/universal/MINIMUM": 0.2857142984867096,
-                "checkpoint_2/user_defined/VARIANCE": 0,
-                "checkpoint_1/user_defined/MEDIAN": 5,
-                "checkpoint_2/universal/STANDARD_DEVIATION": 0.07597187617837561,
-                "checkpoint_1/user_defined/AVERAGE": 5,
-                "checkpoint_1/user_defined/MAXIMUM": 5,
-                "gemini-2.0-flash-001@default/user_defined/MODE": 5,
-                "checkpoint_1/user_defined/P95": 5,
-                "checkpoint_1/universal/P99": 1,
-                "checkpoint_1/user_defined/P90": 5,
-                "checkpoint_2/universal/MEDIAN": 1,
-                "checkpoint_1/universal/P95": 1,
-                "checkpoint_1/user_defined/STANDARD_DEVIATION": 0,
-                "gemini-2.0-flash-001@default/user_defined/STANDARD_DEVIATION": 0.6359497880839245,
-                "checkpoint_1/user_defined/P99": 5,
-                "gemini-2.0-flash-001@default/universal/MODE": [
-                    0.75,
-                    0.8571428656578064,
-                ],
-                "checkpoint_2/user_defined/MODE": 5,
-                "checkpoint_1/universal/P90": 1,
-                "gemini-2.0-flash-001@default/user_defined/P99": 5,
+                "gemini-2.0-flash-001@default/safety_v1/VARIANCE": 0.08950617055834077,
+                "gemini-2.0-flash-001@default/safety_v1/MAXIMUM": 1,
+                "gemini-2.0-flash-001@default/universal/AVERAGE": 0.7888888915379842,
+                "gemini-2.0-flash-001@default/universal/P90": 1,
+                "gemini-2.0-flash-001@default/safety_v1/MEDIAN": 1,
+                "gemini-2.0-flash-001@default/universal/P95": 1,
+                "gemini-2.0-flash-001@default/universal/VARIANCE": 0.08950617055834077,
+                "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.2991758188061675,
+                "gemini-2.0-flash-001@default/universal/MEDIAN": 1,
+                "gemini-2.0-flash-001@default/safety_v1/STANDARD_DEVIATION": 0.2991758188061675,
+                "gemini-2.0-flash-001@default/universal/MODE": 1,
+                "gemini-2.0-flash-001@default/safety_v1/MODE": 1,
+                "gemini-2.0-flash-001@default/safety_v1/MINIMUM": 0.3333333432674408,
+                "gemini-2.0-flash-001@default/safety_v1/P90": 1,
+                "gemini-2.0-flash-001@default/safety_v1/P95": 1,
+                "gemini-2.0-flash-001@default/universal/P99": 1,
+                "gemini-2.0-flash-001@default/safety_v1/AVERAGE": 0.7888888915379842,
+                "gemini-2.0-flash-001@default/universal/MINIMUM": 0.3333333432674408,
+                "gemini-2.0-flash-001@default/universal/MAXIMUM": 1,
+                "gemini-2.0-flash-001@default/safety_v1/P99": 1,
             },
-            total_items=19,
+            total_items=3,
         )
     )
     assert evaluation_run.error is None
 
 
-def check_run_1957799200510967808_evaluation_item_results(
+def check_run_5133048044039700480_evaluation_item_results(
     client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
 ):
     eval_result = evaluation_run.evaluation_item_results
     assert isinstance(eval_result, types.EvaluationResult)
     assert eval_result.summary_metrics == [
         types.AggregatedMetricResult(
-            metric_name="checkpoint_1/universal",
-            mean_score=0.986633250587865,
-            stdev_score=0.0393092386127714,
-        ),
-        types.AggregatedMetricResult(
-            metric_name="checkpoint_2/universal",
-            mean_score=0.9438178790243048,
-            stdev_score=0.07597187617837561,
-        ),
-        types.AggregatedMetricResult(
-            metric_name="gemini-2.0-flash-001@default/universal",
-            mean_score=0.6943817985685249,
-            stdev_score=0.17738341388587855,
+            metric_name="safety_v1",
+            mean_score=0.7888888915379842,
+            stdev_score=0.2991758188061675,
         ),
         types.AggregatedMetricResult(
-            metric_name="checkpoint_1/user_defined", mean_score=5, stdev_score=0
-        ),
-        types.AggregatedMetricResult(
-            metric_name="checkpoint_2/user_defined", mean_score=5, stdev_score=0
-        ),
-        types.AggregatedMetricResult(
-            metric_name="gemini-2.0-flash-001@default/user_defined",
-            mean_score=4.736842105263158,
-            stdev_score=0.6359497880839245,
+            metric_name="universal",
+            mean_score=0.7888888915379842,
+            stdev_score=0.2991758188061675,
         ),
     ]
+    # Check the agent info.
+    assert eval_result.agent_info == types.AgentInfo(
+        name="gemini-2.0-flash-001@default",
+        instruction="example agent developer instruction",
+        description=None,
+        tool_declarations=[
+            genai_types.Tool(
+                function_declarations=[
+                    genai_types.FunctionDeclaration(
+                        name="check_chime",
+                        description="Check chime.",
+                        parameters={
+                            "type": "OBJECT",
+                            "properties": {
+                                "nums": {
+                                    "type": "STRING",
+                                    "description": "List of numbers to be verified."
+                                }
+                            },
+                            "required": [
+                                "nums"
+                            ]
+                        },
+                    ),
+                ],
+            )
+        ],
+    )
     # Check the first eval case result.
     eval_case_result = eval_result.eval_case_results[0]
     assert isinstance(eval_case_result, types.EvalCaseResult)
@@ -275,15 +276,15 @@ def check_run_1957799200510967808_evaluation_item_results(
                 importance="HIGH",
                 type="LANGUAGE:PRIMARY_RESPONSE_LANGUAGE",
             ),
-            reasoning=("The entire response is written in the English language."),
+            reasoning=("The response uses English words."),
             verdict=True,
         )
     )
     # Check the first evaluation dataset.
     eval_dataset = eval_result.evaluation_dataset[0]
     assert isinstance(eval_dataset, types.EvaluationDataset)
     assert eval_dataset.candidate_name == "gemini-2.0-flash-001@default"
-    assert eval_dataset.eval_dataset_df.shape == (19, 3)
+    assert eval_dataset.eval_dataset_df.shape == (3, 3)
 
 
 pytestmark = pytest_helper.setup(
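
As a quick reference, below is roughly the shape of the agent-bearing inference config that check_run_5133048044039700480 asserts against. It is a sketch assembled from the types the test already imports (vertexai.types and google.genai.types), not code from this commit; the instruction and tool values are the example values used in the replay.

# Sketch of the agent config shape asserted in the test above, using
# vertexai.types and google.genai.types exactly as imported there.
from vertexai import types
from google.genai import types as genai_types

inference_config = types.EvaluationRunInferenceConfig(
    agent_config=types.EvaluationRunAgentConfig(
        developer_instruction={"parts": [{"text": "example agent developer instruction"}]},
        tools=[
            genai_types.Tool(
                function_declarations=[
                    genai_types.FunctionDeclaration(
                        name="check_chime",
                        description="Check chime.",
                        parameters={
                            "type": "OBJECT",
                            "properties": {
                                "nums": {
                                    "type": "STRING",
                                    "description": "List of numbers to be verified.",
                                }
                            },
                            "required": ["nums"],
                        },
                    )
                ]
            )
        ],
    )
)

Per the new assertions in the test, the same developer instruction and tool declarations are surfaced back on EvaluationResult.agent_info (name, instruction, tool_declarations) when evaluation items are included.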
