Commit 29852f4

feat: add evaluations
1 parent 0b2648a commit 29852f4

27 files changed: +5625 -259 lines

src/uipath/dev/__init__.py

Lines changed: 1021 additions & 43 deletions
Large diffs are not rendered by default.

src/uipath/dev/models/__init__.py

Lines changed: 12 additions & 2 deletions
@@ -1,12 +1,22 @@
 """UiPath Dev Console models module."""
 
+from uipath.dev.models.eval_run import EvalRun, EvaluationResult, EvaluatorResult
+from uipath.dev.models.evaluator_types import (
+    EVALUATOR_TYPES,
+    get_evaluator_type,
+)
 from uipath.dev.models.execution import ExecutionMode, ExecutionRun
 from uipath.dev.models.messages import ChatMessage, LogMessage, TraceMessage
 
 __all__ = [
-    "ExecutionRun",
-    "ExecutionMode",
+    "EVALUATOR_TYPES",
     "ChatMessage",
+    "EvalRun",
+    "EvaluationResult",
+    "EvaluatorResult",
+    "ExecutionMode",
+    "ExecutionRun",
     "LogMessage",
     "TraceMessage",
+    "get_evaluator_type",
 ]
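
For reference, after this change the evaluation models are importable from the package root alongside the existing execution and message models. A minimal sketch, assuming the package is installed (evaluator_types itself is added elsewhere in this commit and not shown here):

# Everything listed in __all__ above becomes importable from uipath.dev.models.
from uipath.dev.models import (
    EVALUATOR_TYPES,
    ChatMessage,
    EvalRun,
    EvaluationResult,
    EvaluatorResult,
    ExecutionMode,
    ExecutionRun,
    LogMessage,
    TraceMessage,
    get_evaluator_type,
)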

src/uipath/dev/models/eval_run.py

Lines changed: 247 additions & 0 deletions
@@ -0,0 +1,247 @@
"""Model for evaluation runs."""

import os
from dataclasses import dataclass, field
from datetime import datetime
from typing import TYPE_CHECKING, Any
from uuid import uuid4

from rich.text import Text
from uipath.runtime.errors import UiPathErrorContract

if TYPE_CHECKING:
    from uipath.dev.models.messages import LogMessage, TraceMessage


@dataclass
class EvaluatorResult:
    """Result from a single evaluator for a single evaluation."""

    evaluator_id: str
    evaluator_name: str
    score: float
    details: str = ""
    evaluation_time: float = 0.0
    justification: str = ""


@dataclass
class EvaluationResult:
    """Result for a single evaluation."""

    eval_id: str
    eval_name: str
    evaluator_results: list[EvaluatorResult] = field(default_factory=list)

    @property
    def passed(self) -> bool:
        """Check if all evaluators passed (score == 1.0)."""
        return all(r.score == 1.0 for r in self.evaluator_results)


class EvalRun:
    """A single evaluation run."""

    def __init__(
        self,
        eval_set_path: str,
        entrypoint: str,
        *,
        id: str | None = None,
        name: str = "",
        no_report: bool = False,
        workers: int = 1,
        eval_set_run_id: str | None = None,
        enable_mocker_cache: bool = False,
        eval_ids: list[str] | None = None,
        report_coverage: bool = False,
        output_file: str | None = None,
        # For deserialization
        status: str = "pending",
        start_time: datetime | None = None,
        end_time: datetime | None = None,
        evaluator_refs: list[str] | None = None,
        error: UiPathErrorContract | None = None,
    ):
        """Initialize an EvalRun instance."""
        self.id = id if id is not None else str(uuid4())[:8]
        self.eval_set_path = eval_set_path
        self.entrypoint = entrypoint
        self.name = name if name else f"Run: {self.id}"
        self.status = status  # pending, running, completed, failed
        self.start_time = start_time if start_time is not None else datetime.now()
        self.end_time = end_time
        self.evaluator_refs: list[str] = evaluator_refs if evaluator_refs is not None else []
        self.evaluation_results: list[EvaluationResult] = []
        self.error = error
        self.logs: list["LogMessage"] = []
        self.traces: list["TraceMessage"] = []
        # Execution options
        self.no_report = no_report
        self.workers = workers
        self.eval_set_run_id = eval_set_run_id
        self.enable_mocker_cache = enable_mocker_cache
        self.eval_ids: list[str] = eval_ids if eval_ids is not None else []
        self.report_coverage = report_coverage
        self.output_file = output_file

    @property
    def duration(self) -> str:
        """Get the duration of the run as a formatted string."""
        if self.end_time:
            delta = self.end_time - self.start_time
            return f"{delta.total_seconds():.1f}s"
        elif self.start_time:
            delta = datetime.now() - self.start_time
            return f"{delta.total_seconds():.1f}s"
        return "0.0s"

    @property
    def display_name(self) -> Text:
        """Get formatted display name with status indicator."""
        status_colors = {
            "pending": "grey50",
            "running": "yellow",
            "completed": "green",
            "failed": "red",
        }

        status_icon = {
            "pending": "●",
            "running": "▶",
            "completed": "✔",
            "failed": "✖",
        }.get(self.status, "?")

        eval_set_name = (
            os.path.basename(self.eval_set_path).rsplit(".", 1)[0]
            if self.eval_set_path
            else "eval"
        )
        truncated_name = eval_set_name[:8]
        time_str = self.start_time.strftime("%H:%M:%S")
        duration_str = self.duration[:6]

        text = Text()
        text.append(f"{status_icon:<2} ", style=status_colors.get(self.status, "white"))
        text.append(f"{truncated_name:<8} ")
        text.append(f"({time_str:<8}) ")
        text.append(f"[{duration_str:<6}]")

        return text

    @property
    def total_evaluations(self) -> int:
        """Get total number of evaluations."""
        return len(self.evaluation_results)

    @property
    def evaluator_scores(self) -> dict[str, float]:
        """Get average score per evaluator across all evaluations."""
        scores: dict[str, list[float]] = {}
        for eval_result in self.evaluation_results:
            for ev_result in eval_result.evaluator_results:
                if ev_result.evaluator_id not in scores:
                    scores[ev_result.evaluator_id] = []
                scores[ev_result.evaluator_id].append(ev_result.score)

        return {
            ev_id: sum(s) / len(s) if s else 0.0
            for ev_id, s in scores.items()
        }

    @property
    def overall_score(self) -> float:
        """Get overall average score."""
        all_scores = []
        for eval_result in self.evaluation_results:
            for ev_result in eval_result.evaluator_results:
                all_scores.append(ev_result.score)
        return sum(all_scores) / len(all_scores) if all_scores else 0.0

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "id": self.id,
            "name": self.name,
            "eval_set_path": self.eval_set_path,
            "entrypoint": self.entrypoint,
            "status": self.status,
            "start_time": self.start_time.isoformat(),
            "end_time": self.end_time.isoformat() if self.end_time else None,
            "evaluator_refs": self.evaluator_refs,
            "evaluation_results": [
                {
                    "eval_id": er.eval_id,
                    "eval_name": er.eval_name,
                    "evaluator_results": [
                        {
                            "evaluator_id": evr.evaluator_id,
                            "evaluator_name": evr.evaluator_name,
                            "score": evr.score,
                            "details": evr.details,
                            "evaluation_time": evr.evaluation_time,
                            "justification": evr.justification,
                        }
                        for evr in er.evaluator_results
                    ],
                }
                for er in self.evaluation_results
            ],
            "error": self.error.to_dict() if self.error else None,
            # Execution options
            "no_report": self.no_report,
            "workers": self.workers,
            "eval_set_run_id": self.eval_set_run_id,
            "enable_mocker_cache": self.enable_mocker_cache,
            "eval_ids": self.eval_ids,
            "report_coverage": self.report_coverage,
            "output_file": self.output_file,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "EvalRun":
        """Create from dictionary."""
        error_data = data.get("error")
        error = UiPathErrorContract.from_dict(error_data) if error_data else None

        eval_run = cls(
            id=data["id"],
            name=data.get("name", ""),
            eval_set_path=data["eval_set_path"],
            entrypoint=data["entrypoint"],
            status=data.get("status", "pending"),
            start_time=datetime.fromisoformat(data["start_time"]) if data.get("start_time") else None,
            end_time=datetime.fromisoformat(data["end_time"]) if data.get("end_time") else None,
            evaluator_refs=data.get("evaluator_refs", []),
            error=error,
            # Execution options
            no_report=data.get("no_report", False),
            workers=data.get("workers", 1),
            eval_set_run_id=data.get("eval_set_run_id"),
            enable_mocker_cache=data.get("enable_mocker_cache", False),
            eval_ids=data.get("eval_ids", []),
            report_coverage=data.get("report_coverage", False),
            output_file=data.get("output_file"),
        )

        # Parse evaluation results
        for er_data in data.get("evaluation_results", []):
            eval_result = EvaluationResult(
                eval_id=er_data["eval_id"],
                eval_name=er_data.get("eval_name", er_data["eval_id"]),
            )
            for evr_data in er_data.get("evaluator_results", []):
                eval_result.evaluator_results.append(
                    EvaluatorResult(
                        evaluator_id=evr_data["evaluator_id"],
                        evaluator_name=evr_data.get("evaluator_name", evr_data["evaluator_id"]),
                        score=evr_data.get("score", 0.0),
                        details=evr_data.get("details", ""),
                        evaluation_time=evr_data.get("evaluation_time", 0.0),
                        justification=evr_data.get("justification", ""),
                    )
                )
            eval_run.evaluation_results.append(eval_result)

        return eval_run
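
A short usage sketch of the model above, covering score aggregation and the to_dict/from_dict round trip. The eval set path, ids, names, and scores are illustrative placeholders, not values from this commit:

# Illustrative only: all literals below are placeholders.
from uipath.dev.models.eval_run import EvalRun, EvaluationResult, EvaluatorResult

run = EvalRun(eval_set_path="evals/default.json", entrypoint="main.py")
run.status = "completed"

result = EvaluationResult(eval_id="eval-1", eval_name="greeting")
result.evaluator_results.append(
    EvaluatorResult(evaluator_id="exact-match", evaluator_name="Exact Match", score=1.0)
)
result.evaluator_results.append(
    EvaluatorResult(
        evaluator_id="llm-judge",
        evaluator_name="LLM Judge",
        score=0.5,
        justification="Partially correct answer",
    )
)
run.evaluation_results.append(result)

print(result.passed)         # False: not every evaluator scored 1.0
print(run.evaluator_scores)  # {"exact-match": 1.0, "llm-judge": 0.5}
print(run.overall_score)     # 0.75

# Round-trip through the serialization helpers; results survive intact.
restored = EvalRun.from_dict(run.to_dict())
assert restored.overall_score == run.overall_score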
