Skip to content

Commit ef62aa2

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add support to local agent run for agent eval
PiperOrigin-RevId: 829920374
1 parent dd4775b commit ef62aa2

File tree

4 files changed

+363
-53
lines changed

4 files changed

+363
-53
lines changed

tests/unit/vertexai/genai/test_evals.py

Lines changed: 146 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1253,6 +1253,143 @@ def test_run_inference_with_agent_engine_with_response_column_raises_error(
12531253
"'intermediate_events' or 'response' columns"
12541254
) in str(excinfo.value)
12551255

1256+
@mock.patch.object(_evals_utils, "EvalDatasetLoader")
1257+
@mock.patch(
1258+
"vertexai._genai._evals_common.InMemorySessionService"
1259+
)
1260+
@mock.patch("vertexai._genai._evals_common.Runner")
1261+
@mock.patch("vertexai._genai._evals_common.LlmAgent")
1262+
def test_run_inference_with_local_agent(
1263+
self,
1264+
mock_llm_agent,
1265+
mock_runner,
1266+
mock_session_service,
1267+
mock_eval_dataset_loader,
1268+
):
1269+
mock_df = pd.DataFrame(
1270+
{
1271+
"prompt": ["agent prompt", "agent prompt 2"],
1272+
"session_inputs": [
1273+
{
1274+
"user_id": "123",
1275+
"state": {"a": "1"},
1276+
},
1277+
{
1278+
"user_id": "456",
1279+
"state": {"b": "2"},
1280+
},
1281+
],
1282+
}
1283+
)
1284+
mock_eval_dataset_loader.return_value.load.return_value = mock_df.to_dict(
1285+
orient="records"
1286+
)
1287+
1288+
mock_agent_instance = mock.Mock()
1289+
mock_llm_agent.return_value = mock_agent_instance
1290+
mock_session_service.return_value.create_session = mock.AsyncMock()
1291+
mock_runner_instance = mock_runner.return_value
1292+
stream_run_return_value_1 = [
1293+
mock.Mock(
1294+
model_dump=lambda: {
1295+
"id": "1",
1296+
"content": {"parts": [{"text": "intermediate1"}]},
1297+
"timestamp": 123,
1298+
"author": "model",
1299+
}
1300+
),
1301+
mock.Mock(
1302+
model_dump=lambda: {
1303+
"id": "2",
1304+
"content": {"parts": [{"text": "agent response"}]},
1305+
"timestamp": 124,
1306+
"author": "model",
1307+
}
1308+
),
1309+
]
1310+
stream_run_return_value_2 = [
1311+
mock.Mock(
1312+
model_dump=lambda: {
1313+
"id": "3",
1314+
"content": {"parts": [{"text": "intermediate2"}]},
1315+
"timestamp": 125,
1316+
"author": "model",
1317+
}
1318+
),
1319+
mock.Mock(
1320+
model_dump=lambda: {
1321+
"id": "4",
1322+
"content": {"parts": [{"text": "agent response 2"}]},
1323+
"timestamp": 126,
1324+
"author": "model",
1325+
}
1326+
),
1327+
]
1328+
1329+
async def async_iterator(items):
1330+
for item in items:
1331+
yield item
1332+
1333+
mock_runner_instance.run_async.side_effect = [
1334+
async_iterator(stream_run_return_value_1),
1335+
async_iterator(stream_run_return_value_2),
1336+
]
1337+
1338+
inference_result = self.client.evals.run_inference(
1339+
agent=mock_agent_instance,
1340+
src=mock_df,
1341+
)
1342+
1343+
mock_eval_dataset_loader.return_value.load.assert_called_once_with(mock_df)
1344+
assert mock_session_service.call_count == 2
1345+
mock_runner.assert_called_with(
1346+
agent=mock_agent_instance,
1347+
app_name="local agent run",
1348+
session_service=mock_session_service.return_value,
1349+
)
1350+
assert mock_runner.call_count == 2
1351+
assert mock_runner_instance.run_async.call_count == 2
1352+
1353+
pd.testing.assert_frame_equal(
1354+
inference_result.eval_dataset_df,
1355+
pd.DataFrame(
1356+
{
1357+
"prompt": ["agent prompt", "agent prompt 2"],
1358+
"session_inputs": [
1359+
{
1360+
"user_id": "123",
1361+
"state": {"a": "1"},
1362+
},
1363+
{
1364+
"user_id": "456",
1365+
"state": {"b": "2"},
1366+
},
1367+
],
1368+
"intermediate_events": [
1369+
[
1370+
{
1371+
"event_id": "1",
1372+
"content": {"parts": [{"text": "intermediate1"}]},
1373+
"creation_timestamp": 123,
1374+
"author": "model",
1375+
}
1376+
],
1377+
[
1378+
{
1379+
"event_id": "3",
1380+
"content": {"parts": [{"text": "intermediate2"}]},
1381+
"creation_timestamp": 125,
1382+
"author": "model",
1383+
}
1384+
],
1385+
],
1386+
"response": ["agent response", "agent response 2"],
1387+
}
1388+
),
1389+
)
1390+
assert inference_result.candidate_name is None
1391+
assert inference_result.gcs_source is None
1392+
12561393
def test_run_inference_with_litellm_string_prompt_format(
12571394
self,
12581395
mock_api_client_fixture,
@@ -1605,6 +1742,7 @@ def test_run_agent_internal_success(self, mock_run_agent):
16051742
result_df = _evals_common._run_agent_internal(
16061743
api_client=mock_api_client,
16071744
agent_engine=mock_agent_engine,
1745+
agent=None,
16081746
prompt_dataset=prompt_dataset,
16091747
)
16101748

@@ -1635,6 +1773,7 @@ def test_run_agent_internal_error_response(self, mock_run_agent):
16351773
result_df = _evals_common._run_agent_internal(
16361774
api_client=mock_api_client,
16371775
agent_engine=mock_agent_engine,
1776+
agent=None,
16381777
prompt_dataset=prompt_dataset,
16391778
)
16401779

@@ -1661,6 +1800,7 @@ def test_run_agent_internal_malformed_event(self, mock_run_agent):
16611800
result_df = _evals_common._run_agent_internal(
16621801
api_client=mock_api_client,
16631802
agent_engine=mock_agent_engine,
1803+
agent=None,
16641804
prompt_dataset=prompt_dataset,
16651805
)
16661806
assert "response" in result_df.columns
@@ -4990,7 +5130,9 @@ def test_execute_evaluation_adds_creation_timestamp(
49905130
frozenset(["summarization_quality"]),
49915131
)
49925132
@mock.patch("time.sleep", return_value=None)
4993-
@mock.patch("vertexai._genai.evals.Evals._evaluate_instances")
5133+
@mock.patch(
5134+
"vertexai._genai.evals.Evals._evaluate_instances"
5135+
)
49945136
def test_predefined_metric_retry_on_resource_exhausted(
49955137
self,
49965138
mock_private_evaluate_instances,
@@ -5043,7 +5185,9 @@ def test_predefined_metric_retry_on_resource_exhausted(
50435185
frozenset(["summarization_quality"]),
50445186
)
50455187
@mock.patch("time.sleep", return_value=None)
5046-
@mock.patch("vertexai._genai.evals.Evals._evaluate_instances")
5188+
@mock.patch(
5189+
"vertexai._genai.evals.Evals._evaluate_instances"
5190+
)
50475191
def test_predefined_metric_retry_fail_on_resource_exhausted(
50485192
self,
50495193
mock_private_evaluate_instances,

0 commit comments

Comments
 (0)