diff --git a/src/uipath/_cli/_evals/_models/_evaluation_set.py b/src/uipath/_cli/_evals/_models/_evaluation_set.py
index 78a2fb6f5..a5ab5b703 100644
--- a/src/uipath/_cli/_evals/_models/_evaluation_set.py
+++ b/src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -78,6 +78,14 @@ class ModelSettings(BaseModel):
     max_tokens: int | None = Field(default=None, alias="maxTokens")
 
 
+class EvaluationSetModelSettings(BaseModel):
+    """Model settings configuration for evaluation sets."""
+
+    id: str = Field(..., alias="id")
+    model_name: str = Field(..., alias="modelName")
+    temperature: str = Field(..., alias="temperature")  # Can be "same-as-agent" or a numeric string
+
+
 class LLMMockingStrategy(BaseMockingStrategy):
     type: Literal[MockingStrategyType.LLM] = MockingStrategyType.LLM
     prompt: str = Field(..., alias="prompt")
@@ -211,6 +219,9 @@ class EvaluationSet(BaseModel):
         default_factory=list, alias="evaluatorConfigs"
     )
     evaluations: list[EvaluationItem] = Field(default_factory=list)
+    model_settings: list[EvaluationSetModelSettings] = Field(
+        default_factory=list, alias="modelSettings"
+    )
 
     def extract_selected_evals(self, eval_ids) -> None:
         selected_evals: list[EvaluationItem] = []
@@ -239,7 +250,7 @@ class LegacyEvaluationSet(BaseModel):
     name: str
     batch_size: int = Field(10, alias="batchSize")
     timeout_minutes: int = Field(default=20, alias="timeoutMinutes")
-    model_settings: list[dict[str, Any]] = Field(
+    model_settings: list[EvaluationSetModelSettings] = Field(
         default_factory=list, alias="modelSettings"
     )
     created_at: str = Field(alias="createdAt")
diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py
index 23751d266..403bec993 100644
--- a/src/uipath/_cli/_evals/_runtime.py
+++ b/src/uipath/_cli/_evals/_runtime.py
@@ -1,5 +1,7 @@
 import json
 import logging
+import os
+import tempfile
 import uuid
 from collections import defaultdict
 from contextlib import contextmanager
@@ -44,12 +46,14 @@
 from ...eval.evaluators import BaseEvaluator
 from ...eval.models import EvaluationResult
 from ...eval.models.models import AgentExecution, EvalItemResult
+from .._utils._console import ConsoleLogger
 from .._utils._eval_set import EvalHelpers
 from .._utils._parallelization import execute_parallel
 from ._evaluator_factory import EvaluatorFactory
 from ._models._evaluation_set import (
     EvaluationItem,
     EvaluationSet,
+    LegacyEvaluationSet,
 )
 from ._models._exceptions import EvaluationRuntimeException
 from ._models._output import (
@@ -67,6 +71,7 @@
     set_execution_context,
 )
 
+logger = logging.getLogger(__name__)
 
 class ExecutionSpanExporter(SpanExporter):
     """Custom exporter that stores spans grouped by execution ids."""
@@ -153,6 +158,7 @@ class UiPathEvalContext:
     verbose: bool = False
     enable_mocker_cache: bool = False
     report_coverage: bool = False
+    model_settings_id: str = "default"
 
 
 class UiPathEvalRuntime:
@@ -513,11 +519,97 @@ def _get_and_clear_execution_data(
 
         return spans, logs
 
+    async def _apply_model_settings_override(self) -> str | None:
+        """Apply model settings override if specified.
+
+        Returns:
+            Modified entrypoint path if settings were overridden, otherwise None
+        """
+        console = ConsoleLogger()
+        console.info(f"Checking model settings override with ID: '{self.context.model_settings_id}'")
+
+        # Skip if no model settings ID specified
+        if not self.context.model_settings_id or self.context.model_settings_id == "default":
+            return None
+
+        # Load evaluation set to get model settings
+        evaluation_set, _ = EvalHelpers.load_eval_set(self.context.eval_set or "")
+        if not hasattr(evaluation_set, "model_settings") or not evaluation_set.model_settings:
+            console.warning("No model settings available in evaluation set")
+            return None
+
+        # Find the specified model settings
+        target_model_settings = next(
+            (ms for ms in evaluation_set.model_settings if ms.id == self.context.model_settings_id),
+            None
+        )
+
+        if not target_model_settings:
+            logger.warning(f"Model settings ID '{self.context.model_settings_id}' not found in evaluation set")
+            return None
+
+        console.info(f"Found model settings: model='{target_model_settings.model_name}', temperature='{target_model_settings.temperature}'")
+
+        # Early exit: if both values are "same-as-agent", no override needed
+        if (target_model_settings.model_name == "same-as-agent" and
+            target_model_settings.temperature == "same-as-agent"):
+            console.info("Both model and temperature are 'same-as-agent', no override needed")
+            return None
+
+        # Load the original entrypoint file
+        entrypoint_path = Path(self.context.entrypoint or "agent.json")
+        if not entrypoint_path.exists():
+            console.warning(f"Entrypoint file '{entrypoint_path}' not found, model settings override not applicable")
+            return None
+
+        with open(entrypoint_path, "r") as f:
+            agent_data = json.load(f)
+
+        # Apply model settings overrides
+        settings = agent_data.get("settings", {})
+        original_model = settings.get("model", "")
+        original_temperature = settings.get("temperature", 0.0)
+
+        console.info(f"Original agent settings: model='{original_model}', temperature={original_temperature}")
+
+        # Override model if not "same-as-agent"
+        if target_model_settings.model_name != "same-as-agent":
+            settings["model"] = target_model_settings.model_name
+
+        # Override temperature if not "same-as-agent"
+        if target_model_settings.temperature != "same-as-agent":
+            try:
+                settings["temperature"] = float(target_model_settings.temperature)
+            except ValueError:
+                logger.warning(f"Invalid temperature value: '{target_model_settings.temperature}', keeping original")
+
+        agent_data["settings"] = settings
+
+        # Create a temporary file with the modified agent definition
+        temp_fd, temp_path = tempfile.mkstemp(suffix=".json", prefix="agent_override_")
+        try:
+            with os.fdopen(temp_fd, "w") as temp_file:
+                json.dump(agent_data, temp_file, indent=2)
+
+            console.info(f"Applied model settings override: model='{settings.get('model', '')}', temperature={settings.get('temperature', 0.0)}")
+            return temp_path
+        except Exception as e:
+            logger.error(f"Failed to create temporary agent file: {e}")
+            try:
+                os.unlink(temp_path)
+            except OSError:
+                pass
+            return None
+
     async def execute_runtime(
         self, eval_item: EvaluationItem, execution_id: str
     ) -> UiPathEvalRunExecutionOutput:
+        # Apply model settings override if needed
+        overridden_entrypoint = await self._apply_model_settings_override()
+        entrypoint_to_use = overridden_entrypoint or self.context.entrypoint
         runtime = await self.factory.new_runtime(
-            entrypoint=self.context.entrypoint or "",
+            entrypoint=entrypoint_to_use or "",
             runtime_id=execution_id,
         )
         log_handler = self._setup_execution_logging(execution_id)
@@ -551,6 +643,12 @@ async def execute_runtime(
         finally:
             await runtime.dispose()
+            # Clean up temporary file if it was created
+            if overridden_entrypoint and overridden_entrypoint != (self.context.entrypoint or ""):
+                try:
+                    os.unlink(overridden_entrypoint)
+                except Exception as e:
+                    logger.warning(f"Failed to clean up temporary agent file: {e}")
 
         end_time = time()
         spans, logs = self._get_and_clear_execution_data(execution_id)
 
diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py
index 04b42fef9..736d82ae8 100644
--- a/src/uipath/_cli/cli_eval.py
+++ b/src/uipath/_cli/cli_eval.py
@@ -92,6 +92,12 @@ def setup_reporting_prereq(no_report: bool) -> bool:
     default=False,
     help="Report evaluation coverage",
 )
+@click.option(
+    "--model-settings-id",
+    type=str,
+    default="default",
+    help="Model settings ID from evaluation set to override agent settings (default: 'default')",
+)
 def eval(
     entrypoint: str | None,
     eval_set: str | None,
@@ -102,6 +108,7 @@ def eval(
     output_file: str | None,
     enable_mocker_cache: bool,
     report_coverage: bool,
+    model_settings_id: str,
 ) -> None:
     """Run an evaluation set against the agent.
 
@@ -114,6 +121,7 @@ def eval(
         no_report: Do not report the evaluation results
         enable_mocker_cache: Enable caching for LLM mocker responses
        report_coverage: Report evaluation coverage
+        model_settings_id: Model settings ID to override agent settings
     """
     should_register_progress_reporter = setup_reporting_prereq(no_report)
 
@@ -148,6 +156,7 @@ def eval(
     eval_context.eval_set = resolved_eval_set_path
     eval_context.eval_ids = eval_ids
     eval_context.report_coverage = report_coverage
+    eval_context.model_settings_id = model_settings_id
 
     try:
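
For reference, a minimal sketch (not part of the diff) of the override semantics that _apply_model_settings_override implements: an entry selected via --model-settings-id either keeps the agent's own model and temperature ("same-as-agent") or replaces them, with the temperature supplied as a numeric string. The ModelSettingsEntry dataclass below is only a stand-in for the EvaluationSetModelSettings Pydantic model, and apply_override plus the example model name are illustrative assumptions, not code from the repository.

from dataclasses import dataclass


@dataclass
class ModelSettingsEntry:
    """Stand-in for EvaluationSetModelSettings (aliases: id / modelName / temperature)."""

    id: str
    model_name: str   # "same-as-agent" or a concrete model name
    temperature: str  # "same-as-agent" or a numeric string such as "0.2"


def apply_override(agent_settings: dict, entry: ModelSettingsEntry) -> dict:
    """Return a copy of the agent settings with the selected overrides applied."""
    settings = dict(agent_settings)
    if entry.model_name != "same-as-agent":
        settings["model"] = entry.model_name
    if entry.temperature != "same-as-agent":
        try:
            settings["temperature"] = float(entry.temperature)
        except ValueError:
            pass  # invalid numeric string: keep the agent's temperature, as the runtime does
    return settings


if __name__ == "__main__":
    # Illustrative agent settings; the model name is an assumption, not taken from the repo.
    agent = {"model": "example-model", "temperature": 0.0}
    entry = ModelSettingsEntry(id="low-temp", model_name="same-as-agent", temperature="0.2")
    print(apply_override(agent, entry))  # -> {'model': 'example-model', 'temperature': 0.2}

Such an entry would live in the evaluation set's "modelSettings" array (parsed into EvaluationSetModelSettings via the modelSettings alias) and be selected at run time with the new --model-settings-id option; the option's default value "default" skips the override entirely.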