
Commit 5b5e6bd

vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add inference_configs to create_evaluation_run method in Vertex AI GenAI SDK evals
PiperOrigin-RevId: 821689846
1 parent 83553a9 commit 5b5e6bd

4 files changed, +184 -0 lines changed
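For orientation, here is a minimal sketch of how the new agent_info argument (and the resulting inference_configs on the returned EvaluationRun) might be exercised from the public SDK surface. The client construction is assumed, and the run name, project number, evaluation set, and bucket below are placeholders modeled on the tests in this commit:

import vertexai
from vertexai import types
from google.genai import types as genai_types

# Assumed client setup; project and location are placeholders.
client = vertexai.Client(project="my-project", location="us-central1")

# A tool the agent is allowed to call during inference.
tool = genai_types.Tool(
    function_declarations=[
        genai_types.FunctionDeclaration(
            name="get_weather",
            description="Get weather in a location",
            parameters={
                "type": "object",
                "properties": {"location": {"type": "string"}},
            },
        )
    ]
)

# Passing agent_info makes create_evaluation_run populate
# inference_configs[agent_info.name] with an EvaluationRunInferenceConfig
# wrapping an EvaluationRunAgentConfig (developer instruction + tools).
evaluation_run = client.evals.create_evaluation_run(
    name="my-eval-run",
    display_name="my-eval-run",
    data_source=types.EvaluationRunDataSource(
        evaluation_set="projects/PROJECT_NUMBER/locations/us-central1/evaluationSets/EVAL_SET_ID"
    ),
    agent_info=types.AgentInfo(
        name="agent-1",
        instruction="agent-1 instruction",
        tool_declarations=[tool],
    ),
    dest="gs://my-bucket/eval_run_output",
)
print(evaluation_run.inference_configs)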

tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 32 additions & 0 deletions
@@ -16,17 +16,36 @@
 
 from tests.unit.vertexai.genai.replays import pytest_helper
 from vertexai import types
+from google.genai import types as genai_types
 import pytest
 
 
 def test_create_eval_run_data_source_evaluation_set(client):
     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
+    client._api_client._http_options.api_version = "v1beta1"
+    tool = genai_types.Tool(
+        function_declarations=[
+            genai_types.FunctionDeclaration(
+                name="get_weather",
+                description="Get weather in a location",
+                parameters={
+                    "type": "object",
+                    "properties": {"location": {"type": "string"}},
+                },
+            )
+        ]
+    )
     evaluation_run = client.evals.create_evaluation_run(
         name="test4",
         display_name="test4",
         data_source=types.EvaluationRunDataSource(
             evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
         ),
+        agent_info=types.AgentInfo(
+            name="agent-1",
+            instruction="agent-1 instruction",
+            tool_declarations=[tool],
+        ),
         dest="gs://lakeyk-test-limited/eval_run_output",
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
@@ -36,6 +55,16 @@ def test_create_eval_run_data_source_evaluation_set(client):
     assert evaluation_run.data_source.evaluation_set == (
         "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
     )
+    assert evaluation_run.inference_configs[
+        "agent-1"
+    ] == types.EvaluationRunInferenceConfig(
+        agent_config=types.EvaluationRunAgentConfig(
+            developer_instruction=genai_types.Content(
+                parts=[genai_types.Part(text="agent-1 instruction")]
+            ),
+            tools=[tool],
+        )
+    )
     assert evaluation_run.error is None
 
 
@@ -72,6 +101,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
             },
         )
     )
+    assert evaluation_run.inference_configs is None
     assert evaluation_run.error is None
 
 
@@ -108,6 +138,8 @@ async def test_create_eval_run_async(client):
             "checkpoint_2": "checkpoint_2",
         },
     )
+    assert evaluation_run.inference_configs is None
+    assert evaluation_run.error is None
 
 
 pytestmark = pytest_helper.setup(

tests/unit/vertexai/genai/replays/test_get_evaluation_run.py

Lines changed: 8 additions & 0 deletions
@@ -137,6 +137,14 @@ def check_run_1957799200510967808(
     assert evaluation_run.evaluation_run_results.evaluation_set == (
         "projects/503583131166/locations/us-central1/evaluationSets/102386522778501120"
     )
+    assert evaluation_run.inference_configs == {
+        "checkpoint_1": types.EvaluationRunInferenceConfig(
+            model="projects/503583131166/locations/us-central1/endpoints/9030177948249882624"
+        ),
+        "checkpoint_2": types.EvaluationRunInferenceConfig(
+            model="projects/503583131166/locations/us-central1/endpoints/7751155654076661760"
+        ),
+    }
     assert evaluation_run.evaluation_run_results.summary_metrics == (
         types.SummaryMetric(
             metrics={

vertexai/_genai/evals.py

Lines changed: 44 additions & 0 deletions
@@ -80,6 +80,9 @@ def _CreateEvaluationRunParameters_to_vertex(
     if getv(from_object, ["config"]) is not None:
         setv(to_object, ["config"], getv(from_object, ["config"]))
 
+    if getv(from_object, ["inference_configs"]) is not None:
+        setv(to_object, ["inferenceConfigs"], getv(from_object, ["inference_configs"]))
+
     return to_object
 
 
@@ -227,6 +230,9 @@ def _EvaluationRun_from_vertex(
             getv(from_object, ["evaluationResults"]),
         )
 
+    if getv(from_object, ["inferenceConfigs"]) is not None:
+        setv(to_object, ["inference_configs"], getv(from_object, ["inferenceConfigs"]))
+
     return to_object
 
 
@@ -456,6 +462,9 @@ def _create_evaluation_run(
         data_source: types.EvaluationRunDataSourceOrDict,
         evaluation_config: genai_types.EvaluationConfigOrDict,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
+        inference_configs: Optional[
+            dict[str, types.EvaluationRunInferenceConfigOrDict]
+        ] = None,
     ) -> types.EvaluationRun:
         """
         Creates an EvaluationRun.
@@ -467,6 +476,7 @@ def _create_evaluation_run(
             data_source=data_source,
             evaluation_config=evaluation_config,
             config=config,
+            inference_configs=inference_configs,
         )
 
         request_url_dict: Optional[dict[str, str]]
@@ -1289,19 +1299,34 @@ def create_evaluation_run(
         display_name: Optional[str] = None,
         data_source: types.EvaluationRunDataSource,
         dest: str,
+        agent_info: Optional[types.AgentInfo] = None,
        config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
         """Creates an EvaluationRun."""
         output_config = genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=dest)
         )
         evaluation_config = genai_types.EvaluationConfig(output_config=output_config)
+        inference_configs = {}
+        if agent_info:
+            logger.warning(
+                "The agent_info field is experimental and may change in future versions."
+            )
+            inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
+                agent_config=types.EvaluationRunAgentConfig(
+                    developer_instruction=genai_types.Content(
+                        parts=[genai_types.Part(text=agent_info.instruction)]
+                    ),
+                    tools=agent_info.tool_declarations,
+                )
+            )
 
         return self._create_evaluation_run(  # type: ignore[no-any-return]
             name=name,
             display_name=display_name,
             data_source=data_source,
             evaluation_config=evaluation_config,
+            inference_configs=inference_configs,
             config=config,
         )
 
@@ -1509,6 +1534,9 @@ async def _create_evaluation_run(
         data_source: types.EvaluationRunDataSourceOrDict,
         evaluation_config: genai_types.EvaluationConfigOrDict,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
+        inference_configs: Optional[
+            dict[str, types.EvaluationRunInferenceConfigOrDict]
+        ] = None,
     ) -> types.EvaluationRun:
         """
         Creates an EvaluationRun.
@@ -1520,6 +1548,7 @@ async def _create_evaluation_run(
             data_source=data_source,
             evaluation_config=evaluation_config,
             config=config,
+            inference_configs=inference_configs,
         )
 
         request_url_dict: Optional[dict[str, str]]
@@ -2055,19 +2084,34 @@ async def create_evaluation_run(
         display_name: Optional[str] = None,
         data_source: types.EvaluationRunDataSource,
         dest: str,
+        agent_info: Optional[types.AgentInfo] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
         """Creates an EvaluationRun."""
         output_config = genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=dest)
         )
         evaluation_config = genai_types.EvaluationConfig(output_config=output_config)
+        inference_configs = {}
+        if agent_info:
+            logger.warning(
+                "The agent_info field is experimental and may change in future versions."
+            )
+            inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
+                agent_config=types.EvaluationRunAgentConfig(
+                    developer_instruction=genai_types.Content(
+                        parts=[genai_types.Part(text=agent_info.instruction)]
+                    ),
+                    tools=agent_info.tool_declarations,
+                )
+            )
 
         result = await self._create_evaluation_run(  # type: ignore[no-any-return]
             name=name,
             display_name=display_name,
             data_source=data_source,
             evaluation_config=evaluation_config,
+            inference_configs=inference_configs,
             config=config,
         )
 

vertexai/_genai/types.py

Lines changed: 100 additions & 0 deletions
@@ -995,6 +995,9 @@ class _CreateEvaluationRunParameters(_common.BaseModel):
     config: Optional[CreateEvaluationRunConfig] = Field(
         default=None, description=""""""
     )
+    inference_configs: Optional[dict[str, "EvaluationRunInferenceConfig"]] = Field(
+        default=None, description=""""""
+    )
 
 
 class _CreateEvaluationRunParametersDict(TypedDict, total=False):
@@ -1015,6 +1018,9 @@ class _CreateEvaluationRunParametersDict(TypedDict, total=False):
     config: Optional[CreateEvaluationRunConfigDict]
     """"""
 
+    inference_configs: Optional[dict[str, "EvaluationRunInferenceConfigDict"]]
+    """"""
+
 
 _CreateEvaluationRunParametersOrDict = Union[
     _CreateEvaluationRunParameters, _CreateEvaluationRunParametersDict
@@ -1678,6 +1684,32 @@ class EvaluationRun(_common.BaseModel):
         default=None,
         description="""The parsed EvaluationItem results for the evaluation run. This is only populated when include_evaluation_items is set to True.""",
     )
+    inference_configs: Optional[dict[str, "EvaluationRunInferenceConfig"]] = Field(
+        default=None,
+        description="""This field is experimental and may change in future versions. The inference configs for the evaluation run.""",
+    )
+
+    # TODO(b/448806531): Remove all the overridden _from_response methods once the
+    # ticket is resolved and published.
+    @classmethod
+    def _from_response(
+        cls: typing.Type["EvaluationRun"],
+        *,
+        response: dict[str, object],
+        kwargs: dict[str, object],
+    ) -> "EvaluationRun":
+        """Converts a dictionary response into a EvaluationRun object."""
+
+        snaked_response = _camel_key_to_snake(response)
+        if (
+            "evaluation_run_results" in response
+            and "summaryMetrics" in response["evaluation_run_results"]
+        ):
+            snaked_response["evaluation_run_results"]["summary_metrics"] = response[
+                "evaluation_run_results"
+            ]["summaryMetrics"]
+        result = super()._from_response(response=snaked_response, kwargs=kwargs)
+        return result
 
     def show(self) -> None:
         """Shows the evaluation result."""
@@ -1734,6 +1766,9 @@ class EvaluationRunDict(TypedDict, total=False):
     evaluation_item_results: Optional[EvaluationResultDict]
     """The parsed EvaluationItem results for the evaluation run. This is only populated when include_evaluation_items is set to True."""
 
+    inference_configs: Optional[dict[str, "EvaluationRunInferenceConfigDict"]]
+    """This field is experimental and may change in future versions. The inference configs for the evaluation run."""
+
 
 EvaluationRunOrDict = Union[EvaluationRun, EvaluationRunDict]
 
@@ -11867,6 +11902,71 @@ class EvalCaseMetricResultDict(TypedDict, total=False):
 EvalCaseMetricResultOrDict = Union[EvalCaseMetricResult, EvalCaseMetricResultDict]
 
 
+class EvaluationRunAgentConfig(_common.BaseModel):
+    """This field is experimental and may change in future versions.
+
+    Agent config for an evaluation run.
+    """
+
+    developer_instruction: Optional[genai_types.Content] = Field(
+        default=None, description="""The developer instruction for the agent."""
+    )
+    tools: Optional[list[genai_types.Tool]] = Field(
+        default=None, description="""The tools available to the agent."""
+    )
+
+
+class EvaluationRunAgentConfigDict(TypedDict, total=False):
+    """This field is experimental and may change in future versions.
+
+    Agent config for an evaluation run.
+    """
+
+    developer_instruction: Optional[genai_types.ContentDict]
+    """The developer instruction for the agent."""
+
+    tools: Optional[list[genai_types.ToolDict]]
+    """The tools available to the agent."""
+
+
+EvaluationRunAgentConfigOrDict = Union[
+    EvaluationRunAgentConfig, EvaluationRunAgentConfigDict
+]
+
+
+class EvaluationRunInferenceConfig(_common.BaseModel):
+    """This field is experimental and may change in future versions.
+
+    Configuration that describes an agent.
+    """
+
+    agent_config: Optional[EvaluationRunAgentConfig] = Field(
+        default=None, description="""The agent config."""
+    )
+    model: Optional[str] = Field(
+        default=None,
+        description="""The fully qualified name of the publisher model or endpoint to use for inference.""",
+    )
+
+
+class EvaluationRunInferenceConfigDict(TypedDict, total=False):
+    """This field is experimental and may change in future versions.
+
+    Configuration that describes an agent.
+    """
+
+    agent_config: Optional[EvaluationRunAgentConfigDict]
+    """The agent config."""
+
+    model: Optional[str]
+    """The fully qualified name of the publisher model or endpoint to use for inference."""
+
+
+EvaluationRunInferenceConfigOrDict = Union[
+    EvaluationRunInferenceConfig, EvaluationRunInferenceConfigDict
+]
+
+
 class SessionInput(_common.BaseModel):
     """This field is experimental and may change in future versions.
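As the test_get_evaluation_run change above illustrates, an inference_configs entry can also reference a model endpoint directly rather than an agent config, for example when an evaluation run compares tuning checkpoints. A sketch of what such a mapping can look like when read back from an EvaluationRun (the checkpoint keys and endpoint resource names are placeholders):

from vertexai import types

# Each key identifies an inference target (e.g. a tuned checkpoint); in the
# tests in this commit, an entry sets either model or agent_config, not both.
inference_configs = {
    "checkpoint_1": types.EvaluationRunInferenceConfig(
        model="projects/PROJECT_NUMBER/locations/us-central1/endpoints/ENDPOINT_ID_1"
    ),
    "checkpoint_2": types.EvaluationRunInferenceConfig(
        model="projects/PROJECT_NUMBER/locations/us-central1/endpoints/ENDPOINT_ID_2"
    ),
}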
