13 changes: 12 additions & 1 deletion src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -78,6 +78,14 @@ class ModelSettings(BaseModel):
max_tokens: int | None = Field(default=None, alias="maxTokens")


class EvaluationSetModelSettings(BaseModel):
"""Model settings configuration for evaluation sets."""

id: str = Field(..., alias="id")
model_name: str = Field(..., alias="modelName")
temperature: str = Field(..., alias="temperature") # Can be "same-as-agent" or numeric string


class LLMMockingStrategy(BaseMockingStrategy):
type: Literal[MockingStrategyType.LLM] = MockingStrategyType.LLM
prompt: str = Field(..., alias="prompt")
@@ -211,6 +219,9 @@ class EvaluationSet(BaseModel):
default_factory=list, alias="evaluatorConfigs"
)
evaluations: list[EvaluationItem] = Field(default_factory=list)
model_settings: list[EvaluationSetModelSettings] = Field(
default_factory=list, alias="modelSettings"
)

def extract_selected_evals(self, eval_ids) -> None:
selected_evals: list[EvaluationItem] = []
@@ -239,7 +250,7 @@ class LegacyEvaluationSet(BaseModel):
name: str
batch_size: int = Field(10, alias="batchSize")
timeout_minutes: int = Field(default=20, alias="timeoutMinutes")
model_settings: list[dict[str, Any]] = Field(
model_settings: list[EvaluationSetModelSettings] = Field(
default_factory=list, alias="modelSettings"
)
created_at: str = Field(alias="createdAt")
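
For reference, a minimal sketch of how the new EvaluationSetModelSettings model parses a modelSettings entry. The entry values below are illustrative, and the sketch assumes Pydantic v2 (model_validate; on v1 this would be parse_obj):

```python
from uipath._cli._evals._models._evaluation_set import EvaluationSetModelSettings

# Hypothetical modelSettings entry from an evaluation set file.
raw = {"id": "low-temp", "modelName": "same-as-agent", "temperature": "0.2"}

settings = EvaluationSetModelSettings.model_validate(raw)
assert settings.model_name == "same-as-agent"  # keep the agent's own model
assert float(settings.temperature) == 0.2      # numeric string overrides the temperature
```

A temperature of "same-as-agent" likewise leaves the agent's configured temperature untouched, mirroring the checks in _apply_model_settings_override below.
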
100 changes: 99 additions & 1 deletion src/uipath/_cli/_evals/_runtime.py
@@ -1,5 +1,7 @@
import json
import logging
import os
import tempfile
import uuid
from collections import defaultdict
from contextlib import contextmanager
@@ -44,12 +46,14 @@
from ...eval.evaluators import BaseEvaluator
from ...eval.models import EvaluationResult
from ...eval.models.models import AgentExecution, EvalItemResult
from .._utils._console import ConsoleLogger
from .._utils._eval_set import EvalHelpers
from .._utils._parallelization import execute_parallel
from ._evaluator_factory import EvaluatorFactory
from ._models._evaluation_set import (
EvaluationItem,
EvaluationSet,
LegacyEvaluationSet,
)
from ._models._exceptions import EvaluationRuntimeException
from ._models._output import (
@@ -67,6 +71,7 @@
set_execution_context,
)

logger = logging.getLogger(__name__)

class ExecutionSpanExporter(SpanExporter):
"""Custom exporter that stores spans grouped by execution ids."""
@@ -153,6 +158,7 @@ class UiPathEvalContext:
verbose: bool = False
enable_mocker_cache: bool = False
report_coverage: bool = False
model_settings_id: str = "default"


class UiPathEvalRuntime:
@@ -513,11 +519,97 @@ def _get_and_clear_execution_data(

return spans, logs

async def _apply_model_settings_override(self) -> str | None:
"""Apply model settings override if specified.

Returns:
Modified entrypoint path if settings were overridden, otherwise None
"""
console = ConsoleLogger()
console.info(f"Checking model settings override with ID: '{self.context.model_settings_id}'")

# Skip if no model settings ID specified
if not self.context.model_settings_id or self.context.model_settings_id == "default":
return None

# Load evaluation set to get model settings
evaluation_set, _ = EvalHelpers.load_eval_set(self.context.eval_set or "")
if not hasattr(evaluation_set, 'model_settings') or not evaluation_set.model_settings:
console.warning("No model settings available in evaluation set")
return None

# Find the specified model settings
target_model_settings = next(
(ms for ms in evaluation_set.model_settings if ms.id == self.context.model_settings_id),
None
)

if not target_model_settings:
logger.warning(f"Model settings ID '{self.context.model_settings_id}' not found in evaluation set")
return None

console.info(f"Found model settings: model='{target_model_settings.model_name}', temperature='{target_model_settings.temperature}'")
Reviewer comment (Member): Please use logger; console is a CLI concept (imagine we extract the evals engine to another library).
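
A minimal sketch of the direction this comment suggests, using the module-level logger the PR already adds (illustrative only, not a committed change; shown for the first console.info call above):

```python
logger.info(
    "Checking model settings override with ID: '%s'",
    self.context.model_settings_id,
)
```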


# Early exit: if both values are "same-as-agent", no override needed
if (target_model_settings.model_name == "same-as-agent" and
target_model_settings.temperature == "same-as-agent"):
console.info("Both model and temperature are 'same-as-agent', no override needed")
return None

# Load the original entrypoint file
entrypoint_path = Path(self.context.entrypoint or "agent.json")
Reviewer comment (Member): We shouldn't leak agent.json concepts here; similar comment: #1048 (review).
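
A hypothetical sketch of one way to keep the agent.json default out of this layer; the names come from the surrounding code, but this is not the library's API, and the actual design choice is up to the PR author:

```python
# Require the caller (e.g. the CLI layer) to resolve the entrypoint; skip quietly otherwise.
if not self.context.entrypoint:
    logger.warning("No entrypoint resolved; skipping model settings override")
    return None
entrypoint_path = Path(self.context.entrypoint)
```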

if not entrypoint_path.exists():
console.warning(f"Entrypoint file '{entrypoint_path}' not found, model settings override not applicable")
return None

with open(entrypoint_path, 'r') as f:
agent_data = json.load(f)

# Apply model settings overrides
settings = agent_data.get("settings", {})
original_model = settings.get("model", "")
original_temperature = settings.get("temperature", 0.0)

console.info(f"Original agent settings: model='{original_model}', temperature={original_temperature}")

# Override model if not "same-as-agent"
if target_model_settings.model_name != "same-as-agent":
settings["model"] = target_model_settings.model_name

# Override temperature if not "same-as-agent"
if target_model_settings.temperature != "same-as-agent":
try:
settings["temperature"] = float(target_model_settings.temperature)
except ValueError:
logger.warning(f"Invalid temperature value: '{target_model_settings.temperature}', keeping original")

agent_data["settings"] = settings

# Create a temporary file with the modified agent definition
temp_fd, temp_path = tempfile.mkstemp(suffix=".json", prefix="agent_override_")
try:
with os.fdopen(temp_fd, 'w') as temp_file:
json.dump(agent_data, temp_file, indent=2)

console.info(f"Applied model settings override: model='{settings.get('model', '')}', temperature={settings.get('temperature', 0.0)}")
return temp_path
except Exception as e:
logger.error(f"Failed to create temporary agent file: {e}")
try:
os.unlink(temp_path)
except OSError:  # best-effort cleanup of the partially written temp file
pass
return None

async def execute_runtime(
self, eval_item: EvaluationItem, execution_id: str
) -> UiPathEvalRunExecutionOutput:
# Apply model settings override if needed
overridden_entrypoint = await self._apply_model_settings_override()
entrypoint_to_use = overridden_entrypoint or self.context.entrypoint

runtime = await self.factory.new_runtime(
entrypoint=self.context.entrypoint or "",
entrypoint=entrypoint_to_use or "",
runtime_id=execution_id,
)
log_handler = self._setup_execution_logging(execution_id)
@@ -551,6 +643,12 @@ async def execute_runtime(

finally:
await runtime.dispose()
# Clean up temporary file if it was created
if overridden_entrypoint and overridden_entrypoint != (self.context.entrypoint or ""):
try:
os.unlink(overridden_entrypoint)
except Exception as e:
logger.warning(f"Failed to clean up temporary agent file: {e}")

end_time = time()
spans, logs = self._get_and_clear_execution_data(execution_id)
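
Distilled from the override logic above, a self-contained sketch of the temporary-file pattern it relies on; the helper name is illustrative:

```python
import json
import os
import tempfile


def write_agent_override(agent_data: dict) -> str:
    """Write a modified agent definition to a temp file and return its path."""
    fd, path = tempfile.mkstemp(suffix=".json", prefix="agent_override_")
    with os.fdopen(fd, "w") as f:
        json.dump(agent_data, f, indent=2)
    return path  # the caller unlinks this path after the run, as execute_runtime's finally block does
```
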
9 changes: 9 additions & 0 deletions src/uipath/_cli/cli_eval.py
@@ -92,6 +92,12 @@ def setup_reporting_prereq(no_report: bool) -> bool:
default=False,
help="Report evaluation coverage",
)
@click.option(
"--model-settings-id",
type=str,
default="default",
help="Model settings ID from evaluation set to override agent settings (default: 'default')",
)
def eval(
entrypoint: str | None,
eval_set: str | None,
@@ -102,6 +108,7 @@ def eval(
output_file: str | None,
enable_mocker_cache: bool,
report_coverage: bool,
model_settings_id: str,
) -> None:
"""Run an evaluation set against the agent.

@@ -114,6 +121,7 @@
no_report: Do not report the evaluation results
enable_mocker_cache: Enable caching for LLM mocker responses
report_coverage: Report evaluation coverage
model_settings_id: Model settings ID to override agent settings
"""
should_register_progress_reporter = setup_reporting_prereq(no_report)

@@ -148,6 +156,7 @@ def eval(
eval_context.eval_set = resolved_eval_set_path
eval_context.eval_ids = eval_ids
eval_context.report_coverage = report_coverage
eval_context.model_settings_id = model_settings_id

try:

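
For completeness, a hedged sketch of exercising the new flag through click's test runner. The import path follows this diff; the settings ID is a placeholder, and any other options the command requires are omitted for brevity:

```python
from click.testing import CliRunner

from uipath._cli.cli_eval import eval as eval_cmd

runner = CliRunner()
result = runner.invoke(eval_cmd, ["--model-settings-id", "low-temp"])
print(result.exit_code, result.output)
```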