Skip to content

Commit a39e386

Browse files
vertex-sdk-bot and copybara-github
authored and committed
feat: Add PromptTemplateData to support context and history columns when creating Evaluation run from dataframe
PiperOrigin-RevId: 871483777
1 parent 4a8b9a1 commit a39e386

File tree

3 files changed

+100
-6
lines changed

3 files changed

+100
-6
lines changed

tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,9 @@ def test_create_eval_run_with_inference_configs(client):
223223
assert evaluation_run.error is None
224224

225225

226-
# Test fails in replay mode because of UUID generation mismatch.
226+
# # Test fails in replay mode because of UUID generation mismatch.
227+
# import pandas as pd
228+
227229
# def test_create_eval_run_data_source_evaluation_dataset(client):
228230
# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
229231
# input_df = pd.DataFrame(
@@ -319,6 +321,75 @@ def test_create_eval_run_with_inference_configs(client):
319321
# assert evaluation_run.error is None
320322

321323

324+
# def test_create_eval_run_data_source_evaluation_dataset_with_prompt_template_data(
325+
# client,
326+
# ):
327+
# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset and inference_configs."""
328+
# input_df = pd.DataFrame(
329+
# {
330+
# "prompt": ["prompt1", "prompt2"],
331+
# "reference": ["reference1", "reference2"],
332+
# "response": ["response1", "response2"],
333+
# "context": ["context1", "context2"],
334+
# "conversation_history": ["history1", "history2"],
335+
# }
336+
# )
337+
# evaluation_run = client.evals.create_evaluation_run(
338+
# name="test9",
339+
# display_name="test9",
340+
# dataset=types.EvaluationDataset(
341+
# candidate_name="candidate_1",
342+
# eval_dataset_df=input_df,
343+
# ),
344+
# dest=GCS_DEST,
345+
# metrics=[GENERAL_QUALITY_METRIC],
346+
# )
347+
# assert isinstance(evaluation_run, types.EvaluationRun)
348+
# assert evaluation_run.display_name == "test9"
349+
# assert evaluation_run.state == types.EvaluationRunState.PENDING
350+
# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
351+
# # Check evaluation set
352+
# assert evaluation_run.data_source.evaluation_set
353+
# eval_set = client.evals.get_evaluation_set(
354+
# name=evaluation_run.data_source.evaluation_set
355+
# )
356+
# assert len(eval_set.evaluation_items) == 2
357+
# # Check evaluation items
358+
# for i, eval_item_name in enumerate(eval_set.evaluation_items):
359+
# eval_item = client.evals.get_evaluation_item(name=eval_item_name)
360+
# assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
361+
# assert (
362+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
363+
# "prompt"
364+
# ]
365+
# == genai_types.Content(
366+
# parts=[genai_types.Part(text=input_df.iloc[i]["prompt"])],
367+
# role="user",
368+
# )
369+
# )
370+
# assert (
371+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
372+
# "context"
373+
# ]
374+
# == genai_types.Content(
375+
# parts=[genai_types.Part(text=input_df.iloc[i]["context"])],
376+
# role="user",
377+
# )
378+
# )
379+
# assert (
380+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
381+
# "conversation_history"
382+
# ]
383+
# == genai_types.Content(
384+
# parts=[genai_types.Part(text=input_df.iloc[i]["conversation_history"])],
385+
# role="user",
386+
# )
387+
# )
388+
# assert (
389+
# eval_item.evaluation_request.candidate_responses[0].text
390+
# == input_df.iloc[i]["response"]
391+
# )
392+
# assert evaluation_run.error is None
322393
pytest_plugins = ("pytest_asyncio",)
323394

324395

vertexai/_genai/_evals_common.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1858,6 +1858,9 @@ def _object_to_dict(obj: Any) -> Union[dict[str, Any], Any]:
18581858
result[key] = value
18591859
elif isinstance(value, (list, tuple)):
18601860
result[key] = [_object_to_dict(item) for item in value]
1861+
# Add recursive handling for dictionaries
1862+
elif isinstance(value, dict):
1863+
result[key] = {k: _object_to_dict(v) for k, v in value.items()}
18611864
elif isinstance(value, bytes):
18621865
result[key] = base64.b64encode(value).decode("utf-8")
18631866
elif hasattr(value, "__dict__"): # Nested object
@@ -1885,13 +1888,31 @@ def _create_evaluation_set_from_dataframe(
18851888
for event in row[_evals_constant.INTERMEDIATE_EVENTS]:
18861889
if CONTENT in event:
18871890
intermediate_events.append(event[CONTENT])
1891+
if _evals_constant.CONTEXT in row or _evals_constant.HISTORY in row:
1892+
values = {}
1893+
if _evals_constant.CONTEXT in row:
1894+
values[_evals_constant.CONTEXT] = genai_types.Content(
1895+
parts=[genai_types.Part(text=row[_evals_constant.CONTEXT])],
1896+
role=_evals_constant.USER_AUTHOR,
1897+
)
1898+
if _evals_constant.HISTORY in row:
1899+
values[_evals_constant.HISTORY] = genai_types.Content(
1900+
parts=[genai_types.Part(text=row[_evals_constant.HISTORY])],
1901+
role=_evals_constant.USER_AUTHOR,
1902+
)
1903+
if _evals_constant.PROMPT in row:
1904+
values[_evals_constant.PROMPT] = genai_types.Content(
1905+
parts=[genai_types.Part(text=row[_evals_constant.PROMPT])],
1906+
role=_evals_constant.USER_AUTHOR,
1907+
)
1908+
prompt = types.EvaluationPrompt(
1909+
prompt_template_data=types.PromptTemplateData(values=values)
1910+
)
1911+
elif _evals_constant.PROMPT in row:
1912+
prompt = types.EvaluationPrompt(text=row[_evals_constant.PROMPT])
18881913
eval_item_requests.append(
18891914
types.EvaluationItemRequest(
1890-
prompt=(
1891-
types.EvaluationPrompt(text=row[_evals_constant.PROMPT])
1892-
if _evals_constant.PROMPT in row
1893-
else None
1894-
),
1915+
prompt=prompt if prompt else None,
18951916
golden_response=(
18961917
types.CandidateResponse(text=row[_evals_constant.REFERENCE])
18971918
if _evals_constant.REFERENCE in row

vertexai/_genai/_evals_constant.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
CONTENT = "content"
5454
PARTS = "parts"
5555
USER_AUTHOR = "user"
56+
HISTORY = "conversation_history"
5657

5758
COMMON_DATASET_COLUMNS = frozenset(
5859
{
@@ -61,5 +62,6 @@
6162
REFERENCE,
6263
SESSION_INPUT,
6364
CONTEXT,
65+
HISTORY,
6466
}
6567
)

0 commit comments

Comments (0)