Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 31 additions & 15 deletions tests/unit/vertexai/genai/replays/test_evaluate_instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,19 @@ def test_rouge_metric(client):

def test_pointwise_metric(client):
"""Tests the _evaluate_instances method with PointwiseMetricInput."""
instance_dict = {"prompt": "What is the capital of France?", "response": "Paris"}
instance_dict = {
"prompt": "What is the capital of France?",
"response": "Paris",
}
json_instance = json.dumps(instance_dict)

test_input = types.PointwiseMetricInput(
instance=types.PointwiseMetricInstance(json_instance=json_instance),
metric_spec=genai_types.PointwiseMetricSpec(
metric_prompt_template="Evaluate if the response '{response}' correctly answers the prompt '{prompt}'."
metric_prompt_template=(
"Evaluate if the response '{response}' correctly answers the"
" prompt '{prompt}'."
)
),
)
response = client.evals.evaluate_instances(
Expand All @@ -101,19 +107,20 @@ def test_pointwise_metric(client):

def test_pointwise_metric_with_agent_data(client):
"""Tests the _evaluate_instances method with PointwiseMetricInput and agent_data."""
instance_dict = {"prompt": "What is the capital of France?", "response": "Paris"}
instance_dict = {
"prompt": "What is the capital of France?",
"response": "Paris",
}
json_instance = json.dumps(instance_dict)
agent_data = types.evals.AgentData(
agent_config=types.evals.AgentConfig(
tools=types.evals.Tools(
tool=[
genai_types.Tool(
function_declarations=[
genai_types.FunctionDeclaration(name="search")
]
)
]
),
tools=[
genai_types.Tool(
function_declarations=[
genai_types.FunctionDeclaration(name="search")
]
)
],
developer_instruction=types.evals.InstanceData(text="instruction"),
),
events=types.evals.Events(
Expand All @@ -129,7 +136,10 @@ def test_pointwise_metric_with_agent_data(client):
test_input = types.PointwiseMetricInput(
instance=types.PointwiseMetricInstance(json_instance=json_instance),
metric_spec=genai_types.PointwiseMetricSpec(
metric_prompt_template="Evaluate if the response '{response}' correctly answers the prompt '{prompt}'."
metric_prompt_template=(
"Evaluate if the response '{response}' correctly answers the"
" prompt '{prompt}'."
)
),
)
response = client.evals.evaluate_instances(
Expand Down Expand Up @@ -189,7 +199,10 @@ def test_pairwise_metric_with_autorater(client):
test_input = types.PairwiseMetricInput(
instance=types.PairwiseMetricInstance(json_instance=json_instance),
metric_spec=genai_types.PairwiseMetricSpec(
metric_prompt_template="Which response is a better summary? Baseline: '{baseline_response}' or Candidate: '{candidate_response}'"
metric_prompt_template=(
"Which response is a better summary? Baseline:"
" '{baseline_response}' or Candidate: '{candidate_response}'"
)
),
)
autorater_config = genai_types.AutoraterConfig(sampling_count=2)
Expand Down Expand Up @@ -240,7 +253,10 @@ def test_inference_with_prompt_template(client):

def test_run_inference_with_agent(client):
test_df = pd.DataFrame(
{"prompt": ["agent prompt"], "session_inputs": ['{"user_id": "user_123"}']}
{
"prompt": ["agent prompt"],
"session_inputs": ['{"user_id": "user_123"}'],
}
)
inference_result = client.evals.run_inference(
agent="projects/977012026409/locations/us-central1/reasoningEngines/7188347537655332864",
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/vertexai/genai/test_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -4056,7 +4056,7 @@ def test_eval_case_to_agent_data(self):
)

assert agent_data.agent_config.developer_instruction.text == "instruction1"
assert agent_data.agent_config.tools.tool == [tool]
assert agent_data.agent_config.legacy_tools.tool == [tool]
assert agent_data.events.event[0].parts[0].text == "intermediate event"

def test_eval_case_to_agent_data_events_only(self):
Expand Down Expand Up @@ -4164,7 +4164,7 @@ def test_eval_case_to_agent_data_agent_info_empty_tools(self):
)

assert agent_data.agent_config.developer_instruction.text == "instruction1"
assert not agent_data.agent_config.tools.tool
assert not agent_data.agent_config.legacy_tools.tool

def test_eval_case_to_agent_data_agent_info_empty(self):
intermediate_events = [
Expand Down
55 changes: 49 additions & 6 deletions vertexai/_genai/_evals_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1299,21 +1299,36 @@ def _run_agent_internal(
agent=agent,
prompt_dataset=prompt_dataset,
)

agent_obj = agent_engine if agent_engine else agent

processed_intermediate_events = []
processed_responses = []
for resp_item in raw_responses:
processed_agent_data = [] # New column for AgentData

for i, resp_item in enumerate(raw_responses):
intermediate_events_row: list[dict[str, Any]] = []
response_row = None

# --- Legacy Logic: Intermediate Events & Response ---
if isinstance(resp_item, list):
try:
response_row = resp_item[-1]["content"]["parts"][0]["text"]
# Attempt to extract final response text
if resp_item and "content" in resp_item[-1]:
# Basic extraction, assumes last message is model response
final_content = resp_item[-1]["content"]
if isinstance(final_content, dict) and "parts" in final_content:
response_row = final_content["parts"][0].get("text", "")
elif hasattr(final_content, "parts"):
response_row = final_content.parts[0].text

for intermediate_event in resp_item[:-1]:
intermediate_events_row.append(
{
"event_id": intermediate_event["id"],
"content": intermediate_event["content"],
"creation_timestamp": intermediate_event["timestamp"],
"author": intermediate_event["author"],
"event_id": intermediate_event.get("id"),
"content": intermediate_event.get("content"),
"creation_timestamp": intermediate_event.get("timestamp"),
"author": intermediate_event.get("author"),
}
)
except Exception as e: # pylint: disable=broad-exception-caught
Expand All @@ -1335,6 +1350,33 @@ def _run_agent_internal(
processed_intermediate_events.append(intermediate_events_row)
processed_responses.append(response_row)

# --- New Logic: AgentData ---
agent_data_obj = None
try:
# 1. Get User Prompt for the current row
primary_prompt_column = (
"request" if "request" in prompt_dataset.columns else "prompt"
)
user_prompt_val = prompt_dataset.iloc[i][primary_prompt_column]

# 2. Construct Full Session History (User Prompt + Agent Events)
# Normalize user prompt into a message dict structure
user_event = {"role": "user", "content": user_prompt_val}

full_session_history = [user_event]
if isinstance(resp_item, list):
full_session_history.extend(resp_item)

# 3. Create AgentData using the new factory method
agent_data_obj = types.evals.AgentData.from_session(
agent_obj, full_session_history
)
except Exception as e:
logger.warning("Failed to adapt AgentData for row %d: %s", i, e)
# Proceed without AgentData; backend will fall back to legacy fields

processed_agent_data.append(agent_data_obj)

if len(processed_responses) != len(prompt_dataset) or len(
processed_responses
) != len(processed_intermediate_events):
Expand All @@ -1353,6 +1395,7 @@ def _run_agent_internal(
{
_evals_constant.INTERMEDIATE_EVENTS: processed_intermediate_events,
_evals_constant.RESPONSE: processed_responses,
"agent_data": processed_agent_data, # Populate agent_data
}
)

Expand Down
7 changes: 6 additions & 1 deletion vertexai/_genai/_evals_metric_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,11 @@ def _eval_case_to_agent_data(
eval_case: types.EvalCase,
) -> Optional[types.evals.AgentData]:
"""Converts an EvalCase object to an AgentData object."""
# --- NEW LOGIC: Use the structured agent_data if present ---
if getattr(eval_case, "agent_data", None):
return eval_case.agent_data

# --- LEGACY LOGIC: Fallback for older dataframes ---
if not eval_case.agent_info and not eval_case.intermediate_events:
return None
tools = None
Expand All @@ -899,7 +904,7 @@ def _eval_case_to_agent_data(

if tools or developer_instruction:
agent_config = types.evals.AgentConfig(
tools=tools,
legacy_tools=tools,
developer_instruction=developer_instruction,
)

Expand Down
7 changes: 7 additions & 0 deletions vertexai/_genai/types/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1492,6 +1492,10 @@ class EvalCase(_common.BaseModel):
default=None,
description="""This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation.""",
)
agent_data: Optional[evals_types.AgentData] = Field(
default=None,
description="""This field is experimental and may change in future versions. The agent data of the agent under evaluation.""",
)
# Allow extra fields to support custom metric prompts and stay backward compatible.
model_config = ConfigDict(frozen=True, extra="allow")

Expand Down Expand Up @@ -1526,6 +1530,9 @@ class EvalCaseDict(TypedDict, total=False):
agent_info: Optional[evals_types.AgentInfo]
"""This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation."""

agent_data: Optional[evals_types.AgentData]
"""This field is experimental and may change in future versions. The agent data of the agent under evaluation."""


EvalCaseOrDict = Union[EvalCase, EvalCaseDict]

Expand Down
Loading
Loading