diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py index cd97ab042c..d10523f563 100644 --- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py +++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py @@ -18,6 +18,7 @@ from vertexai import types from google.genai import types as genai_types import pytest +import pandas as pd GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output" GENERAL_QUALITY_METRIC = types.EvaluationRunMetric( @@ -63,28 +64,46 @@ ) ), ) - +TOOL = genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="get_weather", + description="Get weather in a location", + parameters={ + "type": "object", + "properties": {"location": {"type": "string"}}, + }, + ) + ] +) +AGENT_INFO = types.evals.AgentInfo( + agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456", + name="agent-1", + instruction="agent-1 instruction", + tool_declarations=[TOOL], +) +DEFAULT_PROMPT_TEMPLATE = "{prompt}" +INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame( + { + "prompt": ["prompt1", "prompt2"], + "reference": ["reference1", "reference2"], + "response": ["response1", "response2"], + "context": ["context1", "context2"], + "conversation_history": ["history1", "history2"], + } +) +CANDIDATE_NAME = "candidate_1" +MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" +EVAL_SET_NAME = "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" def test_create_eval_run_data_source_evaluation_set(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" client._api_client._http_options.api_version = "v1beta1" - tool = genai_types.Tool( - function_declarations=[ - genai_types.FunctionDeclaration( - name="get_weather", - description="Get weather in a location", - parameters={ - "type": "object", - "properties": {"location": {"type": "string"}}, - }, - ) - ] - ) evaluation_run = client.evals.create_evaluation_run( name="test4", display_name="test4", dataset=types.EvaluationRunDataSource( - evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" + evaluation_set=EVAL_SET_NAME ), dest=GCS_DEST, metrics=[ @@ -94,21 +113,14 @@ def test_create_eval_run_data_source_evaluation_set(client): EXACT_MATCH_COMPUTATION_BASED_METRIC, BLEU_COMPUTATION_BASED_METRIC, ], - agent_info=types.evals.AgentInfo( - agent_resource_name="project/123/locations/us-central1/reasoningEngines/456", - name="agent-1", - instruction="agent-1 instruction", - tool_declarations=[tool], - ), + agent_info=AGENT_INFO, labels={"label1": "value1"}, ) assert isinstance(evaluation_run, types.EvaluationRun) assert evaluation_run.display_name == "test4" assert evaluation_run.state == types.EvaluationRunState.PENDING assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == ( - "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ) + assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME assert evaluation_run.evaluation_config == types.EvaluationRunConfig( output_config=genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) @@ -122,13 +134,13 @@ def test_create_eval_run_data_source_evaluation_set(client): ], ) assert evaluation_run.inference_configs[ - "agent-1" + AGENT_INFO.name ] == types.EvaluationRunInferenceConfig( agent_config=types.EvaluationRunAgentConfig( developer_instruction=genai_types.Content( parts=[genai_types.Part(text="agent-1 instruction")] ), - tools=[tool], + tools=[TOOL], ) ) assert evaluation_run.labels == { @@ -190,13 +202,16 @@ def test_create_eval_run_with_inference_configs(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs.""" client._api_client._http_options.api_version = "v1beta1" inference_config = types.EvaluationRunInferenceConfig( - model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" + model=MODEL_NAME, + prompt_template=types.EvaluationRunPromptTemplate( + prompt_template="test prompt template" + ), ) evaluation_run = client.evals.create_evaluation_run( name="test_inference_config", display_name="test_inference_config", dataset=types.EvaluationRunDataSource( - evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" + evaluation_set=EVAL_SET_NAME ), dest=GCS_DEST, metrics=[GENERAL_QUALITY_METRIC], @@ -207,9 +222,7 @@ def test_create_eval_run_with_inference_configs(client): assert evaluation_run.display_name == "test_inference_config" assert evaluation_run.state == types.EvaluationRunState.PENDING assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == ( - "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ) + assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME assert evaluation_run.evaluation_config == types.EvaluationRunConfig( output_config=genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) @@ -223,9 +236,11 @@ def test_create_eval_run_with_inference_configs(client): assert evaluation_run.error is None -# Test fails in replay mode because of UUID generation mismatch. +# Dataframe tests fail in replay mode because of UUID generation mismatch. # def test_create_eval_run_data_source_evaluation_dataset(client): -# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset.""" +# """Tests that create_evaluation_run() creates a correctly structured +# EvaluationRun with EvaluationDataset. +# """ # input_df = pd.DataFrame( # { # "prompt": ["prompt1", "prompt2"], @@ -275,7 +290,7 @@ def test_create_eval_run_with_inference_configs(client): # name="test6", # display_name="test6", # dataset=types.EvaluationDataset( -# candidate_name="candidate_1", +# candidate_name=CANDIDATE_NAME, # eval_dataset_df=input_df, # ), # dest=GCS_DEST, @@ -319,6 +334,196 @@ def test_create_eval_run_with_inference_configs(client): # assert evaluation_run.error is None +# def test_create_eval_run_data_source_evaluation_dataset_with_inference_configs_and_prompt_template_data( +# client, +# ): +# """Tests that create_evaluation_run() creates a correctly structured +# EvaluationRun with EvaluationDataset and inference_configs. +# Prompt template data is inferred from the dataset and a default prompt +# template should be used. +# """ +# evaluation_run = client.evals.create_evaluation_run( +# name="test9", +# display_name="test9", +# dataset=types.EvaluationDataset( +# candidate_name=CANDIDATE_NAME, +# eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY, +# ), +# dest=GCS_DEST, +# metrics=[GENERAL_QUALITY_METRIC], +# inference_configs={ +# CANDIDATE_NAME: types.EvaluationRunInferenceConfig( +# model=MODEL_NAME, +# ) +# }, +# ) +# assert isinstance(evaluation_run, types.EvaluationRun) +# assert evaluation_run.display_name == "test9" +# assert evaluation_run.state == types.EvaluationRunState.PENDING +# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) +# assert evaluation_run.inference_configs[ +# CANDIDATE_NAME +# ] == types.EvaluationRunInferenceConfig( +# model=MODEL_NAME, +# prompt_template=types.EvaluationRunPromptTemplate( +# prompt_template=DEFAULT_PROMPT_TEMPLATE +# ), +# ) +# # Check evaluation set +# assert evaluation_run.data_source.evaluation_set +# eval_set = client.evals.get_evaluation_set( +# name=evaluation_run.data_source.evaluation_set +# ) +# assert len(eval_set.evaluation_items) == 2 +# # Check evaluation items +# for i, eval_item_name in enumerate(eval_set.evaluation_items): +# eval_item = client.evals.get_evaluation_item(name=eval_item_name) +# assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST +# assert ( +# eval_item.evaluation_request.prompt.prompt_template_data.values[ +# "prompt" +# ] +# == genai_types.Content( +# parts=[ +# genai_types.Part( +# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"] +# ) +# ], +# role="user", +# ) +# ) +# assert ( +# eval_item.evaluation_request.prompt.prompt_template_data.values[ +# "context" +# ] +# == genai_types.Content( +# parts=[ +# genai_types.Part( +# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"] +# ) +# ], +# role="user", +# ) +# ) +# assert ( +# eval_item.evaluation_request.prompt.prompt_template_data.values[ +# "conversation_history" +# ] +# == genai_types.Content( +# parts=[ +# genai_types.Part( +# text=( +# INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][ +# "conversation_history" +# ] +# ) +# ) +# ], +# role="user", +# ) +# ) +# assert ( +# eval_item.evaluation_request.candidate_responses[0].text +# == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"] +# ) +# assert evaluation_run.error is None + + +# def test_create_eval_run_data_source_evaluation_dataset_with_agent_info_and_prompt_template_data( +# client, +# ): +# """Tests that create_evaluation_run() creates a correctly structured +# EvaluationRun with EvaluationDataset and agent_info. +# Prompt template data is inferred from the dataset and a default prompt +# template should be used. +# """ +# evaluation_run = client.evals.create_evaluation_run( +# name="test9", +# display_name="test9", +# dataset=types.EvaluationDataset( +# candidate_name=CANDIDATE_NAME, +# eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY, +# ), +# dest=GCS_DEST, +# metrics=[GENERAL_QUALITY_METRIC], +# agent_info=AGENT_INFO, +# ) +# assert isinstance(evaluation_run, types.EvaluationRun) +# assert evaluation_run.display_name == "test9" +# assert evaluation_run.state == types.EvaluationRunState.PENDING +# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) +# assert evaluation_run.inference_configs[ +# AGENT_INFO.name +# ] == types.EvaluationRunInferenceConfig( +# agent_config=types.EvaluationRunAgentConfig( +# developer_instruction=genai_types.Content( +# parts=[genai_types.Part(text=AGENT_INFO.instruction)] +# ), +# tools=[TOOL], +# ), +# prompt_template=types.EvaluationRunPromptTemplate( +# prompt_template=DEFAULT_PROMPT_TEMPLATE +# ), +# ) +# # Check evaluation set +# assert evaluation_run.data_source.evaluation_set +# eval_set = client.evals.get_evaluation_set( +# name=evaluation_run.data_source.evaluation_set +# ) +# assert len(eval_set.evaluation_items) == 2 +# # Check evaluation items +# for i, eval_item_name in enumerate(eval_set.evaluation_items): +# eval_item = client.evals.get_evaluation_item(name=eval_item_name) +# assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST +# assert ( +# eval_item.evaluation_request.prompt.prompt_template_data.values[ +# "prompt" +# ] +# == genai_types.Content( +# parts=[ +# genai_types.Part( +# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"] +# ) +# ], +# role="user", +# ) +# ) +# assert ( +# eval_item.evaluation_request.prompt.prompt_template_data.values[ +# "context" +# ] +# == genai_types.Content( +# parts=[ +# genai_types.Part( +# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"] +# ) +# ], +# role="user", +# ) +# ) +# assert ( +# eval_item.evaluation_request.prompt.prompt_template_data.values[ +# "conversation_history" +# ] +# == genai_types.Content( +# parts=[ +# genai_types.Part( +# text=( +# INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][ +# "conversation_history" +# ] +# ) +# ) +# ], +# role="user", +# ) +# ) +# assert ( +# eval_item.evaluation_request.candidate_responses[0].text +# == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"] +# ) +# assert evaluation_run.error is None + pytest_plugins = ("pytest_asyncio",) @@ -371,13 +576,16 @@ async def test_create_eval_run_async_with_inference_configs(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously.""" client._api_client._http_options.api_version = "v1beta1" inference_config = types.EvaluationRunInferenceConfig( - model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" + model=MODEL_NAME, + prompt_template=types.EvaluationRunPromptTemplate( + prompt_template="Test the {prompt}" + ), ) evaluation_run = await client.aio.evals.create_evaluation_run( name="test_inference_config_async", display_name="test_inference_config_async", dataset=types.EvaluationRunDataSource( - evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" + evaluation_set=EVAL_SET_NAME ), dest=GCS_DEST, metrics=[GENERAL_QUALITY_METRIC], @@ -388,9 +596,7 @@ async def test_create_eval_run_async_with_inference_configs(client): assert evaluation_run.display_name == "test_inference_config_async" assert evaluation_run.state == types.EvaluationRunState.PENDING assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == ( - "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ) + assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME assert evaluation_run.evaluation_config == types.EvaluationRunConfig( output_config=genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py index f33320324a..277dacaa87 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -258,6 +258,145 @@ def _extract_contents_for_inference( return request_dict_or_raw_text +def _resolve_dataset( + api_client: BaseApiClient, + dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset], + dest: str, + agent_info_pydantic: Optional[types.evals.AgentInfo] = None, +) -> types.EvaluationRunDataSource: + """Resolves dataset for the evaluation run.""" + if isinstance(dataset, types.EvaluationDataset): + candidate_name = _get_candidate_name(dataset, agent_info_pydantic) + eval_set = _create_evaluation_set_from_dataframe( + api_client, + dest, + dataset.eval_dataset_df, + candidate_name, + ) + dataset = types.EvaluationRunDataSource(evaluation_set=eval_set.name) + return dataset + + +def _get_default_prompt_template( + api_client: BaseApiClient, + inference_config: types.EvaluationRunInferenceConfigOrDict, + dataset: types.EvaluationRunDataSource, +) -> Any: + """Resolves prompt template data for the evaluation run.""" + if isinstance(inference_config, dict): + if inference_config.get("prompt_template"): + return inference_config["prompt_template"] + elif inference_config.prompt_template: + return inference_config.prompt_template + + try: + evals_module = evals.Evals(api_client_=api_client) + eval_set = evals_module.get_evaluation_set(name=dataset.evaluation_set) + if eval_set and eval_set.evaluation_items: + eval_item = evals_module.get_evaluation_item( + name=eval_set.evaluation_items[0] + ) + if ( + eval_item + and eval_item.evaluation_request + and eval_item.evaluation_request.prompt + and eval_item.evaluation_request.prompt.prompt_template_data + ): + if ( + "prompt" + in eval_item.evaluation_request.prompt.prompt_template_data.values + ): + return "{prompt}" # Default prompt template + except Exception as e: + logger.warning("Failed to get prompt template from evaluation set: %s", e) + return None + + +def _resolve_inference_configs( + api_client: BaseApiClient, + dataset: types.EvaluationRunDataSource, + inference_configs: Optional[ + dict[str, types.EvaluationRunInferenceConfigOrDict] + ] = None, + agent_info_pydantic: Optional[types.evals.AgentInfo] = None, +) -> Optional[dict[str, types.EvaluationRunInferenceConfigOrDict]]: + """Resolves inference configs for the evaluation run.""" + # Resolve agent config + if agent_info_pydantic and agent_info_pydantic.name: + inference_configs = {} + inference_configs[agent_info_pydantic.name] = ( + types.EvaluationRunInferenceConfig( + agent_config=types.EvaluationRunAgentConfig( + developer_instruction=genai_types.Content( + parts=[genai_types.Part(text=agent_info_pydantic.instruction)] + ), + tools=agent_info_pydantic.tool_declarations, + ) + ) + ) + # Resolve prompt template data + if inference_configs: + for inference_config in inference_configs.values(): + prompt_template_val = ( + inference_config.get("prompt_template") + if isinstance(inference_config, dict) + else inference_config.prompt_template + ) + if not prompt_template_val: + default_prompt_template = _get_default_prompt_template( + api_client, inference_config, dataset + ) + if default_prompt_template: + prompt_template_to_set = default_prompt_template + if not isinstance( + default_prompt_template, types.EvaluationRunPromptTemplate + ): + prompt_template_to_set = types.EvaluationRunPromptTemplate( + prompt_template=default_prompt_template + ) + if isinstance(inference_config, dict): + inference_config[ + "prompt_template" + ] = prompt_template_to_set.model_dump(exclude_none=True) + else: + inference_config.prompt_template = ( + prompt_template_to_set.model_dump(exclude_none=True) + ) + return inference_configs + + +def _add_evaluation_run_labels( + labels: Optional[dict[str, str]] = None, + agent_info_pydantic: Optional[types.evals.AgentInfo] = None, +) -> Optional[dict[str, str]]: + """Adds labels to the evaluation run.""" + if agent_info_pydantic and agent_info_pydantic.agent_resource_name: + labels = labels or {} + labels["vertex-ai-evaluation-agent-engine-id"] = ( + agent_info_pydantic.agent_resource_name.split("reasoningEngines/")[-1] + ) + return labels + + +def _get_candidate_name( + dataset: types.EvaluationDataset, + agent_info_pydantic: Optional[types.evals.AgentInfo] = None, +) -> Optional[str]: + """Internal helper to get candidate name.""" + if agent_info_pydantic is not None and ( + dataset.candidate_name + and agent_info_pydantic + and agent_info_pydantic.name + and dataset.candidate_name != agent_info_pydantic.name + ): + logger.warning( + "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended." + ) + elif dataset.candidate_name is None and agent_info_pydantic: + return agent_info_pydantic.name + return dataset.candidate_name or None + + def _execute_inference_concurrently( api_client: BaseApiClient, prompt_dataset: pd.DataFrame, @@ -1858,6 +1997,9 @@ def _object_to_dict(obj: Any) -> Union[dict[str, Any], Any]: result[key] = value elif isinstance(value, (list, tuple)): result[key] = [_object_to_dict(item) for item in value] + # Add recursive handling for dictionaries + elif isinstance(value, dict): + result[key] = {k: _object_to_dict(v) for k, v in value.items()} elif isinstance(value, bytes): result[key] = base64.b64encode(value).decode("utf-8") elif hasattr(value, "__dict__"): # Nested object @@ -1885,13 +2027,31 @@ def _create_evaluation_set_from_dataframe( for event in row[_evals_constant.INTERMEDIATE_EVENTS]: if CONTENT in event: intermediate_events.append(event[CONTENT]) + if _evals_constant.CONTEXT in row or _evals_constant.HISTORY in row: + values = {} + if _evals_constant.CONTEXT in row: + values[_evals_constant.CONTEXT] = genai_types.Content( + parts=[genai_types.Part(text=row[_evals_constant.CONTEXT])], + role=_evals_constant.USER_AUTHOR, + ) + if _evals_constant.HISTORY in row: + values[_evals_constant.HISTORY] = genai_types.Content( + parts=[genai_types.Part(text=row[_evals_constant.HISTORY])], + role=_evals_constant.USER_AUTHOR, + ) + if _evals_constant.PROMPT in row: + values[_evals_constant.PROMPT] = genai_types.Content( + parts=[genai_types.Part(text=row[_evals_constant.PROMPT])], + role=_evals_constant.USER_AUTHOR, + ) + prompt = types.EvaluationPrompt( + prompt_template_data=types.PromptTemplateData(values=values) + ) + elif _evals_constant.PROMPT in row: + prompt = types.EvaluationPrompt(text=row[_evals_constant.PROMPT]) eval_item_requests.append( types.EvaluationItemRequest( - prompt=( - types.EvaluationPrompt(text=row[_evals_constant.PROMPT]) - if _evals_constant.PROMPT in row - else None - ), + prompt=prompt if prompt else None, golden_response=( types.CandidateResponse(text=row[_evals_constant.REFERENCE]) if _evals_constant.REFERENCE in row diff --git a/vertexai/_genai/_evals_constant.py b/vertexai/_genai/_evals_constant.py index 6fc27d94e0..847140dc5c 100644 --- a/vertexai/_genai/_evals_constant.py +++ b/vertexai/_genai/_evals_constant.py @@ -53,6 +53,7 @@ CONTENT = "content" PARTS = "parts" USER_AUTHOR = "user" +HISTORY = "conversation_history" COMMON_DATASET_COLUMNS = frozenset( { @@ -61,5 +62,6 @@ REFERENCE, SESSION_INPUT, CONTEXT, + HISTORY, } ) diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index 3632628b87..73922955cd 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -253,6 +253,9 @@ def _EvaluationRunConfig_from_vertex( if getv(from_object, ["autoraterConfig"]) is not None: setv(to_object, ["autorater_config"], getv(from_object, ["autoraterConfig"])) + if getv(from_object, ["promptTemplate"]) is not None: + setv(to_object, ["prompt_template"], getv(from_object, ["promptTemplate"])) + return to_object @@ -277,6 +280,9 @@ def _EvaluationRunConfig_to_vertex( if getv(from_object, ["autorater_config"]) is not None: setv(to_object, ["autoraterConfig"], getv(from_object, ["autorater_config"])) + if getv(from_object, ["prompt_template"]) is not None: + setv(to_object, ["promptTemplate"], getv(from_object, ["prompt_template"])) + return to_object @@ -1625,32 +1631,14 @@ def create_evaluation_run( raise ValueError( "At most one of agent_info or inference_configs can be provided." ) - agent_info_pydantic: types.evals.AgentInfo = types.evals.AgentInfo() - if agent_info: - if isinstance(agent_info, dict): - agent_info_pydantic = types.evals.AgentInfo.model_validate(agent_info) - else: - agent_info_pydantic = agent_info - if isinstance(dataset, types.EvaluationDataset): - if dataset.eval_dataset_df is None: - raise ValueError( - "EvaluationDataset must have eval_dataset_df populated." - ) - if agent_info_pydantic is not None and ( - dataset.candidate_name - and agent_info_pydantic - and agent_info_pydantic.name - and dataset.candidate_name != agent_info_pydantic.name - ): - logger.warning( - "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended." - ) - elif dataset.candidate_name is None and agent_info_pydantic: - dataset.candidate_name = agent_info_pydantic.name - eval_set = _evals_common._create_evaluation_set_from_dataframe( - self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name - ) - dataset = types.EvaluationRunDataSource(evaluation_set=eval_set.name) + agent_info_pydantic = ( + types.evals.AgentInfo.model_validate(agent_info) + if isinstance(agent_info, dict) + else (agent_info or types.evals.AgentInfo()) + ) + resolved_dataset = _evals_common._resolve_dataset( + self._api_client, dataset, dest, agent_info_pydantic + ) output_config = genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=dest) ) @@ -1660,37 +1648,20 @@ def create_evaluation_run( evaluation_config = types.EvaluationRunConfig( output_config=output_config, metrics=resolved_metrics ) - if agent_info_pydantic and agent_info_pydantic.name is not None: - inference_configs = {} - inference_configs[agent_info_pydantic.name] = ( - types.EvaluationRunInferenceConfig( - agent_config=types.EvaluationRunAgentConfig( - developer_instruction=genai_types.Content( - parts=[ - genai_types.Part(text=agent_info_pydantic.instruction) - ] - ), - tools=agent_info_pydantic.tool_declarations, - ) - ) - ) - if agent_info_pydantic.agent_resource_name: - labels = labels or {} - labels["vertex-ai-evaluation-agent-engine-id"] = ( - agent_info_pydantic.agent_resource_name.split("reasoningEngines/")[ - -1 - ] - ) - if not name: - name = f"evaluation_run_{uuid.uuid4()}" - + resolved_inference_configs = _evals_common._resolve_inference_configs( + self._api_client, resolved_dataset, inference_configs, agent_info_pydantic + ) + resolved_labels = _evals_common._add_evaluation_run_labels( + labels, agent_info_pydantic + ) + resolved_name = name or f"evaluation_run_{uuid.uuid4()}" return self._create_evaluation_run( - name=name, - display_name=display_name or name, - data_source=dataset, + name=resolved_name, + display_name=display_name or resolved_name, + data_source=resolved_dataset, evaluation_config=evaluation_config, - inference_configs=inference_configs, - labels=labels, + inference_configs=resolved_inference_configs, + labels=resolved_labels, config=config, ) @@ -2495,27 +2466,14 @@ async def create_evaluation_run( raise ValueError( "At most one of agent_info or inference_configs can be provided." ) - if agent_info and isinstance(agent_info, dict): - agent_info = types.evals.AgentInfo.model_validate(agent_info) - if isinstance(dataset, types.EvaluationDataset): - if dataset.eval_dataset_df is None: - raise ValueError( - "EvaluationDataset must have eval_dataset_df populated." - ) - if agent_info is not None and ( - dataset.candidate_name - and agent_info.name - and dataset.candidate_name != agent_info.name - ): - logger.warning( - "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended." - ) - elif dataset.candidate_name is None and agent_info: - dataset.candidate_name = agent_info.name - eval_set = _evals_common._create_evaluation_set_from_dataframe( - self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name - ) - dataset = types.EvaluationRunDataSource(evaluation_set=eval_set.name) + agent_info_pydantic = ( + types.evals.AgentInfo.model_validate(agent_info) + if isinstance(agent_info, dict) + else (agent_info or types.evals.AgentInfo()) + ) + resolved_dataset = _evals_common._resolve_dataset( + self._api_client, dataset, dest, agent_info_pydantic + ) output_config = genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=dest) ) @@ -2525,31 +2483,21 @@ async def create_evaluation_run( evaluation_config = types.EvaluationRunConfig( output_config=output_config, metrics=resolved_metrics ) - if agent_info and agent_info.name is not None: - inference_configs = {} - inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig( - agent_config=types.EvaluationRunAgentConfig( - developer_instruction=genai_types.Content( - parts=[genai_types.Part(text=agent_info.instruction)] - ), - tools=agent_info.tool_declarations, - ) - ) - if agent_info.agent_resource_name: - labels = labels or {} - labels["vertex-ai-evaluation-agent-engine-id"] = ( - agent_info.agent_resource_name.split("reasoningEngines/")[-1] - ) - if not name: - name = f"evaluation_run_{uuid.uuid4()}" + resolved_inference_configs = _evals_common._resolve_inference_configs( + self._api_client, resolved_dataset, inference_configs, agent_info_pydantic + ) + resolved_labels = _evals_common._add_evaluation_run_labels( + labels, agent_info_pydantic + ) + resolved_name = name or f"evaluation_run_{uuid.uuid4()}" result = await self._create_evaluation_run( - name=name, - display_name=display_name or name, - data_source=dataset, + name=resolved_name, + display_name=display_name or resolved_name, + data_source=resolved_dataset, evaluation_config=evaluation_config, - inference_configs=inference_configs, - labels=labels, + inference_configs=resolved_inference_configs, + labels=resolved_labels, config=config, ) diff --git a/vertexai/_genai/types/__init__.py b/vertexai/_genai/types/__init__.py index 8b02bc222c..9a7248ad23 100644 --- a/vertexai/_genai/types/__init__.py +++ b/vertexai/_genai/types/__init__.py @@ -362,6 +362,9 @@ from .common import EvaluationRunMetricDict from .common import EvaluationRunMetricOrDict from .common import EvaluationRunOrDict +from .common import EvaluationRunPromptTemplate +from .common import EvaluationRunPromptTemplateDict +from .common import EvaluationRunPromptTemplateOrDict from .common import EvaluationRunResults from .common import EvaluationRunResultsDict from .common import EvaluationRunResultsOrDict @@ -1104,6 +1107,9 @@ "EvaluationRunMetric", "EvaluationRunMetricDict", "EvaluationRunMetricOrDict", + "EvaluationRunPromptTemplate", + "EvaluationRunPromptTemplateDict", + "EvaluationRunPromptTemplateOrDict", "EvaluationRunConfig", "EvaluationRunConfigDict", "EvaluationRunConfigOrDict", diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index 2ec662eded..561929c10c 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -1098,6 +1098,38 @@ class EvaluationRunMetricDict(TypedDict, total=False): EvaluationRunMetricOrDict = Union[EvaluationRunMetric, EvaluationRunMetricDict] +class EvaluationRunPromptTemplate(_common.BaseModel): + """Prompt template used for inference.""" + + prompt_template: Optional[str] = Field( + default=None, + description="""Inline prompt template. Template variables should be in the format + "{var_name}".""", + ) + gcs_uri: Optional[str] = Field( + default=None, + description="""Prompt template stored in Cloud Storage. Format: + "gs://my-bucket/file-name.txt".""", + ) + + +class EvaluationRunPromptTemplateDict(TypedDict, total=False): + """Prompt template used for inference.""" + + prompt_template: Optional[str] + """Inline prompt template. Template variables should be in the format + "{var_name}".""" + + gcs_uri: Optional[str] + """Prompt template stored in Cloud Storage. Format: + "gs://my-bucket/file-name.txt".""" + + +EvaluationRunPromptTemplateOrDict = Union[ + EvaluationRunPromptTemplate, EvaluationRunPromptTemplateDict +] + + class EvaluationRunConfig(_common.BaseModel): """The evaluation configuration used for the evaluation run.""" @@ -1111,6 +1143,9 @@ class EvaluationRunConfig(_common.BaseModel): autorater_config: Optional[genai_types.AutoraterConfig] = Field( default=None, description="""The autorater config for the evaluation run.""" ) + prompt_template: Optional[EvaluationRunPromptTemplate] = Field( + default=None, description="""The prompt template used for inference.""" + ) class EvaluationRunConfigDict(TypedDict, total=False): @@ -1125,6 +1160,9 @@ class EvaluationRunConfigDict(TypedDict, total=False): autorater_config: Optional[genai_types.AutoraterConfigDict] """The autorater config for the evaluation run.""" + prompt_template: Optional[EvaluationRunPromptTemplateDict] + """The prompt template used for inference.""" + EvaluationRunConfigOrDict = Union[EvaluationRunConfig, EvaluationRunConfigDict] @@ -1843,6 +1881,9 @@ class EvaluationRunInferenceConfig(_common.BaseModel): default=None, description="""The fully qualified name of the publisher model or endpoint to use for inference.""", ) + prompt_template: Optional[EvaluationRunPromptTemplate] = Field( + default=None, description="""The prompt template used for inference.""" + ) class EvaluationRunInferenceConfigDict(TypedDict, total=False): @@ -1857,6 +1898,9 @@ class EvaluationRunInferenceConfigDict(TypedDict, total=False): model: Optional[str] """The fully qualified name of the publisher model or endpoint to use for inference.""" + prompt_template: Optional[EvaluationRunPromptTemplateDict] + """The prompt template used for inference.""" + EvaluationRunInferenceConfigOrDict = Union[ EvaluationRunInferenceConfig, EvaluationRunInferenceConfigDict