1616
1717from tests .unit .vertexai .genai .replays import pytest_helper
1818from vertexai import types
19+ from google .genai import types as genai_types
1920import datetime
2021import pytest
2122
2223
def test_get_eval_run(client):
    """Fetch the run with its evaluation items and verify run and item results.

    Points the client at the autopush sandbox endpoint (API version v1beta1),
    requests the run with ``include_evaluation_items=True``, then hands the
    result to both the run-level and the item-results checkers.
    """
    http_options = client._api_client._http_options
    http_options.base_url = (
        "https://us-central1-autopush-aiplatform.sandbox.googleapis.com/"
    )
    http_options.api_version = "v1beta1"

    # NOTE(review): the literal ends with a trailing space exactly as it appears
    # in this view -- confirm against the recorded replay.
    run_name = "projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480 "
    fetched_run = client.evals.get_evaluation_run(
        name=run_name,
        include_evaluation_items=True,
    )

    checker_args = (client, fetched_run, run_name)
    check_run_5133048044039700480(*checker_args)
    check_run_5133048044039700480_evaluation_item_results(*checker_args)
3540
3641
def test_get_eval_run_include_evaluation_items_false(client):
    """Fetch the run without asking for evaluation items.

    ``include_evaluation_items`` is not passed to ``get_evaluation_run()``;
    the run-level fields are still checked and ``evaluation_item_results``
    is expected to be ``None``.
    """
    http_options = client._api_client._http_options
    http_options.base_url = (
        "https://us-central1-autopush-aiplatform.sandbox.googleapis.com/"
    )
    http_options.api_version = "v1beta1"

    # NOTE(review): the literal ends with a trailing space exactly as it appears
    # in this view -- confirm against the recorded replay.
    run_name = "projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480 "
    fetched_run = client.evals.get_evaluation_run(name=run_name)

    check_run_5133048044039700480(client, fetched_run, run_name)
    assert fetched_run.evaluation_item_results is None
4554
4655
@@ -99,158 +108,150 @@ def test_get_eval_run_eval_set_source(client):
@pytest.mark.asyncio
async def test_get_eval_run_async(client):
    """Fetch the run through the async client using only its bare run id.

    The request passes ``eval_run_id`` as ``name``; the returned run is then
    checked against the fully qualified resource name, and no evaluation item
    results are expected.
    """
    http_options = client._api_client._http_options
    http_options.base_url = (
        "https://us-central1-autopush-aiplatform.sandbox.googleapis.com/"
    )
    http_options.api_version = "v1beta1"

    eval_run_id = "5133048044039700480"
    # NOTE(review): the f-string ends with a trailing space exactly as it
    # appears in this view -- confirm against the recorded replay.
    expected_name = f"projects/503583131166/locations/us-central1/evaluationRuns/{ eval_run_id } "

    fetched_run = await client.aio.evals.get_evaluation_run(name=eval_run_id)

    check_run_5133048044039700480(client, fetched_run, expected_name)
    assert fetched_run.evaluation_item_results is None
109122
110123
def check_run_5133048044039700480(
    client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
):
    """Assert the run-level fields of evaluation run 5133048044039700480.

    Args:
        client: Not used by this checker.
        evaluation_run: The run object to verify.
        evaluation_run_name: Resource name the run is expected to carry.

    Raises:
        AssertionError: If any field differs from the expected value.
    """
    utc = datetime.timezone.utc
    candidate = "gemini-2.0-flash-001@default"
    # The snapshot and the data source are both compared to this evaluation set.
    source_eval_set = (
        "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
    )

    # NOTE(review): several expected literals below end with a trailing space
    # exactly as they appear in this view -- confirm against the recorded replay.
    assert isinstance(evaluation_run, types.EvaluationRun)
    assert evaluation_run.name == evaluation_run_name
    assert evaluation_run.display_name == "sdk-test-1 "
    assert evaluation_run.metadata == {"pipeline_id": "4868043098678099968 "}

    expected_created = datetime.datetime(2025, 10, 21, 19, 25, 58, 669441, tzinfo=utc)
    assert evaluation_run.create_time == expected_created
    expected_completed = datetime.datetime(2025, 10, 21, 19, 26, 15, 855568, tzinfo=utc)
    assert evaluation_run.completion_time == expected_completed

    assert evaluation_run.state == types.EvaluationRunState.SUCCEEDED
    assert evaluation_run.evaluation_set_snapshot == source_eval_set
    assert evaluation_run.data_source.evaluation_set == source_eval_set

    run_results = evaluation_run.evaluation_run_results
    assert run_results.evaluation_set == (
        "projects/503583131166/locations/us-central1/evaluationSets/129513673658990592 "
    )

    # Expected agent configuration: one developer instruction and one tool
    # exposing a single function declaration.
    check_chime = genai_types.FunctionDeclaration(
        name="check_chime",
        description="Check chime.",
        parameters={
            "type": "OBJECT",
            "properties": {
                "nums": {
                    "type": "STRING",
                    "description": "List of numbers to be verified.",
                }
            },
            "required": ["nums"],
        },
    )
    expected_agent_config = types.EvaluationRunAgentConfig(
        developer_instruction={
            "parts": [{"text": "example agent developer instruction"}]
        },
        tools=[genai_types.Tool(function_declarations=[check_chime])],
    )
    assert evaluation_run.inference_configs == {
        candidate: types.EvaluationRunInferenceConfig(
            agent_config=expected_agent_config
        ),
    }

    # Keys are grouped by metric family; dict equality ignores insertion order.
    expected_metrics = {
        "gemini-2.0-flash-001@default/safety_v1/AVERAGE": 0.7888888915379842,
        "gemini-2.0-flash-001@default/safety_v1/MAXIMUM": 1,
        "gemini-2.0-flash-001@default/safety_v1/MEDIAN": 1,
        "gemini-2.0-flash-001@default/safety_v1/MINIMUM": 0.3333333432674408,
        "gemini-2.0-flash-001@default/safety_v1/MODE": 1,
        "gemini-2.0-flash-001@default/safety_v1/P90": 1,
        "gemini-2.0-flash-001@default/safety_v1/P95": 1,
        "gemini-2.0-flash-001@default/safety_v1/P99": 1,
        "gemini-2.0-flash-001@default/safety_v1/STANDARD_DEVIATION": 0.2991758188061675,
        "gemini-2.0-flash-001@default/safety_v1/VARIANCE": 0.08950617055834077,
        "gemini-2.0-flash-001@default/universal/AVERAGE": 0.7888888915379842,
        "gemini-2.0-flash-001@default/universal/MAXIMUM": 1,
        "gemini-2.0-flash-001@default/universal/MEDIAN": 1,
        "gemini-2.0-flash-001@default/universal/MINIMUM": 0.3333333432674408,
        "gemini-2.0-flash-001@default/universal/MODE": 1,
        "gemini-2.0-flash-001@default/universal/P90": 1,
        "gemini-2.0-flash-001@default/universal/P95": 1,
        "gemini-2.0-flash-001@default/universal/P99": 1,
        "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.2991758188061675,
        "gemini-2.0-flash-001@default/universal/VARIANCE": 0.08950617055834077,
    }
    assert run_results.summary_metrics == types.SummaryMetric(
        metrics=expected_metrics,
        total_items=3,
    )

    assert evaluation_run.error is None
219208
220209
221- def check_run_1957799200510967808_evaluation_item_results (
210+ def check_run_5133048044039700480_evaluation_item_results (
222211 client , evaluation_run : types .EvaluationRun , evaluation_run_name : str
223212):
224213 eval_result = evaluation_run .evaluation_item_results
225214 assert isinstance (eval_result , types .EvaluationResult )
226215 assert eval_result .summary_metrics == [
227216 types .AggregatedMetricResult (
228- metric_name = "checkpoint_1/universal" ,
229- mean_score = 0.986633250587865 ,
230- stdev_score = 0.0393092386127714 ,
231- ),
232- types .AggregatedMetricResult (
233- metric_name = "checkpoint_2/universal" ,
234- mean_score = 0.9438178790243048 ,
235- stdev_score = 0.07597187617837561 ,
236- ),
237- types .AggregatedMetricResult (
238- metric_name = "gemini-2.0-flash-001@default/universal" ,
239- mean_score = 0.6943817985685249 ,
240- stdev_score = 0.17738341388587855 ,
217+ metric_name = "safety_v1" ,
218+ mean_score = 0.7888888915379842 ,
219+ stdev_score = 0.2991758188061675 ,
241220 ),
242221 types .AggregatedMetricResult (
243- metric_name = "checkpoint_1/user_defined" , mean_score = 5 , stdev_score = 0
244- ),
245- types .AggregatedMetricResult (
246- metric_name = "checkpoint_2/user_defined" , mean_score = 5 , stdev_score = 0
247- ),
248- types .AggregatedMetricResult (
249- metric_name = "gemini-2.0-flash-001@default/user_defined" ,
250- mean_score = 4.736842105263158 ,
251- stdev_score = 0.6359497880839245 ,
222+ metric_name = "universal" ,
223+ mean_score = 0.7888888915379842 ,
224+ stdev_score = 0.2991758188061675 ,
252225 ),
253226 ]
227+ # Check the agent info.
228+ assert eval_result .agent_info == types .AgentInfo (
229+ name = "gemini-2.0-flash-001@default" ,
230+ instruction = "example agent developer instruction" ,
231+ description = None ,
232+ tool_declarations = [
233+ genai_types .Tool (
234+ function_declarations = [
235+ genai_types .FunctionDeclaration (
236+ name = "check_chime" ,
237+ description = "Check chime." ,
238+ parameters = {
239+ "type" : "OBJECT" ,
240+ "properties" : {
241+ "nums" : {
242+ "type" : "STRING" ,
243+ "description" : "List of numbers to be verified."
244+ }
245+ },
246+ "required" : [
247+ "nums"
248+ ]
249+ },
250+ ),
251+ ],
252+ )
253+ ],
254+ )
254255 # Check the first eval case result.
255256 eval_case_result = eval_result .eval_case_results [0 ]
256257 assert isinstance (eval_case_result , types .EvalCaseResult )
@@ -275,15 +276,15 @@ def check_run_1957799200510967808_evaluation_item_results(
275276 importance = "HIGH" ,
276277 type = "LANGUAGE:PRIMARY_RESPONSE_LANGUAGE" ,
277278 ),
278- reasoning = ("The entire response is written in the English language ." ),
279+ reasoning = ("The response uses English words ." ),
279280 verdict = True ,
280281 )
281282 )
282283 # Check the first evaluation dataset.
283284 eval_dataset = eval_result .evaluation_dataset [0 ]
284285 assert isinstance (eval_dataset , types .EvaluationDataset )
285286 assert eval_dataset .candidate_name == "gemini-2.0-flash-001@default"
286- assert eval_dataset .eval_dataset_df .shape == (19 , 3 )
287+ assert eval_dataset .eval_dataset_df .shape == (3 , 3 )
287288
288289
289290pytestmark = pytest_helper .setup (
0 commit comments