
Commit b39e1cd

feat: add eval cli command

1 parent c91f8dc

23 files changed: +1977 −56 lines

pyproject.toml

Lines changed: 1 addition & 2 deletions

@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.1.8"
+version = "2.1.9"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.10"
@@ -103,7 +103,6 @@ line-ending = "auto"
 plugins = ["pydantic.mypy"]
 exclude = ["samples/.*"]
 
-
 follow_imports = "silent"
 warn_redundant_casts = true
 warn_unused_ignores = true

src/uipath/_cli/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -5,6 +5,7 @@
 
 from .cli_auth import auth as auth  # type: ignore
 from .cli_deploy import deploy as deploy  # type: ignore
+from .cli_eval import eval as eval  # type: ignore
 from .cli_init import init as init  # type: ignore
 from .cli_invoke import invoke as invoke  # type: ignore
 from .cli_new import new as new  # type: ignore
@@ -67,3 +68,4 @@ def cli(lv: bool, v: bool) -> None:
 cli.add_command(invoke)
 cli.add_command(push)
 cli.add_command(pull)
+cli.add_command(eval)
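
Note: the `eval as eval` alias re-exports the Click command under a name that shadows Python's builtin `eval` within this module only. A minimal sketch of smoke-testing the wiring above with Click's test runner, assuming `cli` is importable from `uipath._cli` as this diff shows; only Click's built-in `--help` flag is used, since the eval command's own options are not part of this hunk:

# Sketch, not part of the commit: exercise the newly registered subcommand.
from click.testing import CliRunner

from uipath._cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["eval", "--help"])
print(result.output)  # usage text for the new subcommand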
src/uipath/_cli/_evals/_evaluators/__init__.py

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+"""Evaluators package for the evaluation system.
+
+This package contains all evaluator types and the factory for creating them.
+"""
+
+from ._agent_scorer_evaluator import AgentScorerEvaluator
+from ._deterministic_evaluator import DeterministicEvaluator
+from ._evaluator_base import EvaluatorBase
+from ._evaluator_factory import EvaluatorFactory
+from ._llm_as_judge_evaluator import LlmAsAJudgeEvaluator
+from ._trajectory_evaluator import TrajectoryEvaluator
+
+__all__ = [
+    "EvaluatorBase",
+    "EvaluatorFactory",
+    "DeterministicEvaluator",
+    "LlmAsAJudgeEvaluator",
+    "AgentScorerEvaluator",
+    "TrajectoryEvaluator",
+]
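
Note: the flat `__all__` means downstream code imports from the package root rather than the private modules, e.g. (sketch; the package path is inferred from the relative imports in this commit):

from uipath._cli._evals._evaluators import EvaluatorBase, EvaluatorFactory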
src/uipath/_cli/_evals/_evaluators/_agent_scorer_evaluator.py

Lines changed: 48 additions & 0 deletions

@@ -0,0 +1,48 @@
+from typing import Any, Dict
+
+from .._models import EvaluationResult
+from ._evaluator_base import EvaluatorBase
+
+
+class AgentScorerEvaluator(EvaluatorBase):
+    """Evaluator that uses an agent to score outputs."""
+
+    def __init__(
+        self,
+        agent_config: Dict[str, Any],
+        scoring_criteria: Dict[str, Any],
+        target_output_key: str = "*",
+    ):
+        """Initialize the agent scorer evaluator.
+
+        Args:
+            agent_config: Configuration for the scoring agent
+            scoring_criteria: Criteria used for scoring
+            target_output_key: Key in output to evaluate ("*" for entire output)
+        """
+        super().__init__()
+        self.agent_config = agent_config or {}
+        self.scoring_criteria = scoring_criteria or {}
+        self.target_output_key = target_output_key
+
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate using an agent scorer.
+
+        Args:
+            evaluation_id: The ID of the evaluation being processed
+            evaluation_name: The name of the evaluation
+            input_data: The input data for the evaluation
+            expected_output: The expected output
+            actual_output: The actual output from the agent
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        raise NotImplementedError()
src/uipath/_cli/_evals/_evaluators/_deterministic_evaluator.py

Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
+from typing import Any, Dict
+
+from .._models import EvaluationResult
+from ._evaluator_base import EvaluatorBase
+
+
+class DeterministicEvaluator(EvaluatorBase):
+    """Evaluator for deterministic/rule-based evaluations."""
+
+    def __init__(self, rule_config: Dict[str, Any], target_output_key: str = "*"):
+        """Initialize the deterministic evaluator.
+
+        Args:
+            rule_config: Configuration for the rule (expected_value, regex_pattern, etc.)
+            target_output_key: Key in output to evaluate ("*" for entire output)
+        """
+        super().__init__()
+        self.rule_config = rule_config or {}
+        self.target_output_key = target_output_key
+
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate using deterministic rules.
+
+        Args:
+            evaluation_id: The ID of the evaluation being processed
+            evaluation_name: The name of the evaluation
+            input_data: The input data for the evaluation
+            expected_output: The expected output
+            actual_output: The actual output from the agent
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        raise NotImplementedError()
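
Note: this commit leaves `DeterministicEvaluator.evaluate` raising NotImplementedError. For orientation, a sketch of how the two rule kinds named in the `rule_config` docstring (expected_value, regex_pattern) might be applied; the pass/fail 1.0/0.0 scoring and the standalone helper are illustrative assumptions, not the commit's behavior:

# Illustrative sketch only -- evaluate() is unimplemented in this commit.
import re
from typing import Any, Dict


def apply_rule(rule_config: Dict[str, Any], actual: Any) -> float:
    """Return 1.0 when the rule matches the actual value, else 0.0."""
    if "expected_value" in rule_config:
        return 1.0 if actual == rule_config["expected_value"] else 0.0
    if "regex_pattern" in rule_config:
        return 1.0 if re.search(rule_config["regex_pattern"], str(actual)) else 0.0
    raise ValueError("rule_config needs 'expected_value' or 'regex_pattern'")


print(apply_rule({"regex_pattern": r"\bdone\b"}, "task done"))  # 1.0
print(apply_rule({"expected_value": 42}, 41))                   # 0.0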
src/uipath/_cli/_evals/_evaluators/_evaluator_base.py

Lines changed: 124 additions & 0 deletions

@@ -0,0 +1,124 @@
+import functools
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict
+
+from uipath._cli._evals._models import (
+    EvaluationResult,
+    EvaluatorCategory,
+    EvaluatorType,
+)
+
+
+def measure_execution_time(func):
+    """Decorator to measure execution time and update EvaluationResult.evaluation_time."""
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs) -> EvaluationResult:
+        start_time = time.time()
+        result = await func(*args, **kwargs)
+        end_time = time.time()
+        execution_time = end_time - start_time
+
+        result.evaluation_time = execution_time
+        return result
+
+    return wrapper
+
+
+@dataclass
+class EvaluatorBaseParams:
+    """Parameters for initializing the base evaluator."""
+
+    evaluator_id: str
+    category: EvaluatorCategory
+    evaluator_type: EvaluatorType
+    name: str
+    description: str
+    created_at: str
+    updated_at: str
+    target_output_key: str
+
+
+class EvaluatorBase(ABC):
+    """Abstract base class for all evaluators."""
+
+    def __init__(self):
+        # initialization done via 'from_params' function
+        self.id: str
+        self.name: str
+        self.description: str
+        self.created_at: str
+        self.updated_at: str
+        self.category: EvaluatorCategory
+        self.type: EvaluatorType
+        self.target_output_key: str
+
+    @classmethod
+    def from_params(cls, params: EvaluatorBaseParams, **kwargs):
+        """Initialize the base evaluator from parameters.
+
+        Args:
+            params: EvaluatorBaseParams containing base configuration
+            **kwargs: Additional specific parameters for concrete evaluators
+
+        Returns:
+            Initialized evaluator instance
+        """
+        instance = cls(**kwargs)
+        instance.id = params.evaluator_id
+        instance.category = params.category
+        instance.type = params.evaluator_type
+        instance.name = params.name
+        instance.description = params.description
+        instance.created_at = params.created_at
+        instance.updated_at = params.updated_at
+        instance.target_output_key = params.target_output_key
+        return instance
+
+    @measure_execution_time
+    @abstractmethod
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate the given data and return a result.
+
+        Args:
+            evaluation_id: The ID of the evaluation being processed
+            evaluation_name: The name of the evaluation
+            input_data: The input data for the evaluation
+            expected_output: The expected output
+            actual_output: The actual output from the agent
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        pass
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the evaluator instance to a dictionary representation.
+
+        Returns:
+            Dict[str, Any]: Dictionary containing all evaluator properties
+        """
+        return {
+            "id": self.id,
+            "name": self.name,
+            "description": self.description,
+            "created_at": self.created_at,
+            "updated_at": self.updated_at,
+            "category": self.category.name if self.category else None,
+            "type": self.type.name if self.type else None,
+            "target_output_key": self.target_output_key,
+        }
+
+    def __repr__(self) -> str:
+        """String representation of the evaluator."""
+        return f"{self.__class__.__name__}(id='{self.id}', name='{self.name}', category={self.category.name})"
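
Note: construction is two-phase here: `__init__` only declares the shared attributes, and `from_params` calls `cls(**kwargs)` and then copies the `EvaluatorBaseParams` fields onto the instance. Also, `@measure_execution_time` wraps the abstract method itself, so a subclass override is not timed unless it re-applies the decorator. A minimal concrete subclass as a sketch, assuming the package path implied by the relative imports and assuming `EvaluationResult` accepts a `score` keyword (its definition in `_models` is not part of this diff):

from typing import Any, Dict

from uipath._cli._evals._evaluators import EvaluatorBase
from uipath._cli._evals._evaluators._evaluator_base import EvaluatorBaseParams
from uipath._cli._evals._models import (
    EvaluationResult,
    EvaluatorCategory,
    EvaluatorType,
)


class ExactMatchEvaluator(EvaluatorBase):
    """Sketch: scores 1.0 when the actual output equals the expected output."""

    async def evaluate(
        self,
        evaluation_id: str,
        evaluation_name: str,
        input_data: Dict[str, Any],
        expected_output: Dict[str, Any],
        actual_output: Dict[str, Any],
    ) -> EvaluationResult:
        score = 1.0 if actual_output == expected_output else 0.0
        # Assumed constructor: EvaluationResult's fields live in _models,
        # which this diff does not show.
        return EvaluationResult(score=score)


evaluator = ExactMatchEvaluator.from_params(
    EvaluatorBaseParams(
        evaluator_id="ev-001",
        category=EvaluatorCategory.Deterministic,
        evaluator_type=EvaluatorType.Unknown,
        name="exact-match",
        description="Exact equality check",
        created_at="2025-01-01T00:00:00Z",
        updated_at="2025-01-01T00:00:00Z",
        target_output_key="*",
    )
)
print(evaluator)  # uses EvaluatorBase.__repr__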
src/uipath/_cli/_evals/_evaluators/_evaluator_factory.py

Lines changed: 103 additions & 0 deletions

@@ -0,0 +1,103 @@
+from typing import Any, Dict
+
+from .._models import EvaluatorCategory, EvaluatorType
+from ._agent_scorer_evaluator import AgentScorerEvaluator
+from ._deterministic_evaluator import DeterministicEvaluator
+from ._evaluator_base import EvaluatorBase, EvaluatorBaseParams
+from ._llm_as_judge_evaluator import LlmAsAJudgeEvaluator
+from ._trajectory_evaluator import TrajectoryEvaluator
+
+
+class EvaluatorFactory:
+    """Factory class for creating evaluator instances based on configuration."""
+
+    @staticmethod
+    def create_evaluator(data: Dict[str, Any]) -> EvaluatorBase:
+        """Create an evaluator instance from configuration data.
+
+        Args:
+            data: Dictionary containing evaluator configuration from JSON file
+
+        Returns:
+            Appropriate evaluator instance based on category
+
+        Raises:
+            ValueError: If category is unknown or required fields are missing
+        """
+        # Extract common fields
+        evaluator_id = data.get("id")
+        if not evaluator_id:
+            raise ValueError("Evaluator configuration must include 'id' field")
+
+        category = EvaluatorCategory.from_int(data.get("category"))
+        evaluator_type = EvaluatorType.from_int(data.get("type", EvaluatorType.Unknown))
+        name = data.get("name", "")
+        description = data.get("description", "")
+        created_at = data.get("createdAt", "")
+        updated_at = data.get("updatedAt", "")
+        target_output_key = data.get("targetOutputKey", "")
+
+        # Create base parameters
+        base_params = EvaluatorBaseParams(
+            evaluator_id=evaluator_id,
+            category=category,
+            evaluator_type=evaluator_type,
+            name=name,
+            description=description,
+            created_at=created_at,
+            updated_at=updated_at,
+            target_output_key=target_output_key,
+        )
+
+        # Create evaluator based on category
+        if category == EvaluatorCategory.Deterministic:
+            return EvaluatorFactory._create_deterministic_evaluator(base_params, data)
+        elif category == EvaluatorCategory.LlmAsAJudge:
+            return EvaluatorFactory._create_llm_as_judge_evaluator(base_params, data)
+        elif category == EvaluatorCategory.AgentScorer:
+            return EvaluatorFactory._create_agent_scorer_evaluator(base_params, data)
+        elif category == EvaluatorCategory.Trajectory:
+            return EvaluatorFactory._create_trajectory_evaluator(base_params, data)
+        else:
+            raise ValueError(f"Unknown evaluator category: {category}")
+
+    @staticmethod
+    def _create_deterministic_evaluator(
+        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+    ) -> DeterministicEvaluator:
+        """Create a deterministic evaluator."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def _create_llm_as_judge_evaluator(
+        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+    ) -> LlmAsAJudgeEvaluator:
+        """Create an LLM-as-a-judge evaluator."""
+        prompt = data.get("prompt", "")
+        if not prompt:
+            raise ValueError("LLM evaluator must include 'prompt' field")
+
+        model = data.get("model", "")
+        if not model:
+            raise ValueError("LLM evaluator must include 'model' field")
+
+        return LlmAsAJudgeEvaluator.from_params(
+            base_params,
+            prompt=prompt,
+            model=model,
+            target_output_key=data.get("targetOutputKey", "*"),
+        )
+
+    @staticmethod
+    def _create_agent_scorer_evaluator(
+        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+    ) -> AgentScorerEvaluator:
+        """Create an agent scorer evaluator."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def _create_trajectory_evaluator(
+        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+    ) -> TrajectoryEvaluator:
+        """Create a trajectory evaluator."""
+        raise NotImplementedError()
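
Note: of the four categories, only the LLM-as-a-judge branch is implemented in this commit. A usage sketch with a config shaped the way `create_evaluator` reads it (camelCase keys). The integer enum mappings live in `_models` and are not shown here, so the `category` value below is an assumption, as are the example model name:

from uipath._cli._evals._evaluators import EvaluatorFactory

config = {
    "id": "eval-llm-001",
    "category": 2,  # assumed: the int EvaluatorCategory.from_int maps to LlmAsAJudge
    "name": "helpfulness-judge",
    "description": "LLM grades the helpfulness of the agent output",
    "createdAt": "2025-01-01T00:00:00Z",
    "updatedAt": "2025-01-01T00:00:00Z",
    "prompt": "Score the response from 0 to 100 for helpfulness.",
    "model": "gpt-4o-mini",  # example value; stored as-is on the evaluator
    "targetOutputKey": "*",
}

evaluator = EvaluatorFactory.create_evaluator(config)
print(evaluator)  # LlmAsAJudgeEvaluator(id='eval-llm-001', ...)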
