"""Model for evaluation runs."""

import os
from dataclasses import dataclass, field
from datetime import datetime
from typing import TYPE_CHECKING, Any
from uuid import uuid4

from rich.text import Text
from uipath.runtime.errors import UiPathErrorContract

if TYPE_CHECKING:
    from uipath.dev.models.messages import LogMessage, TraceMessage


@dataclass
class EvaluatorResult:
    """Result from a single evaluator for a single evaluation."""

    evaluator_id: str
    evaluator_name: str
    score: float
    details: str = ""
    evaluation_time: float = 0.0
    justification: str = ""


@dataclass
class EvaluationResult:
    """Result for a single evaluation."""

    eval_id: str
    eval_name: str
    evaluator_results: list[EvaluatorResult] = field(default_factory=list)

    @property
    def passed(self) -> bool:
        """Check if all evaluators passed (score == 1.0)."""
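        # Note: all() over an empty list is True, so an evaluation with no
        # evaluator results is reported as passed.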
        return all(r.score == 1.0 for r in self.evaluator_results)


class EvalRun:
    """A single evaluation run."""

    def __init__(
        self,
        eval_set_path: str,
        entrypoint: str,
        *,
        id: str | None = None,
        name: str = "",
        no_report: bool = False,
        workers: int = 1,
        eval_set_run_id: str | None = None,
        enable_mocker_cache: bool = False,
        eval_ids: list[str] | None = None,
        report_coverage: bool = False,
        output_file: str | None = None,
        # For deserialization
        status: str = "pending",
        start_time: datetime | None = None,
        end_time: datetime | None = None,
        evaluator_refs: list[str] | None = None,
        error: UiPathErrorContract | None = None,
    ):
        """Initialize an EvalRun instance."""
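        # Default to a short random id so each run gets a compact label;
        # the same id seeds the default display name below.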
        self.id = id if id is not None else str(uuid4())[:8]
        self.eval_set_path = eval_set_path
        self.entrypoint = entrypoint
        self.name = name if name else f"Run: {self.id}"
        self.status = status  # pending, running, completed, failed
        self.start_time = start_time if start_time is not None else datetime.now()
        self.end_time = end_time
        self.evaluator_refs: list[str] = evaluator_refs if evaluator_refs is not None else []
        self.evaluation_results: list[EvaluationResult] = []
        self.error = error
        self.logs: list["LogMessage"] = []
        self.traces: list["TraceMessage"] = []
        # Execution options
        self.no_report = no_report
        self.workers = workers
        self.eval_set_run_id = eval_set_run_id
        self.enable_mocker_cache = enable_mocker_cache
        self.eval_ids: list[str] = eval_ids if eval_ids is not None else []
        self.report_coverage = report_coverage
        self.output_file = output_file

    @property
    def duration(self) -> str:
        """Get the duration of the run as a formatted string."""
        if self.end_time:
            delta = self.end_time - self.start_time
            return f"{delta.total_seconds():.1f}s"
        elif self.start_time:
            delta = datetime.now() - self.start_time
            return f"{delta.total_seconds():.1f}s"
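        # Defensive fallback; __init__ always sets start_time, so this branch
        # should not normally be reached.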
        return "0.0s"

    @property
    def display_name(self) -> Text:
        """Get formatted display name with status indicator."""
        status_colors = {
            "pending": "grey50",
            "running": "yellow",
            "completed": "green",
            "failed": "red",
        }

        status_icon = {
            "pending": "●",
            "running": "▶",
            "completed": "✔",
            "failed": "✖",
        }.get(self.status, "?")

        eval_set_name = (
            os.path.basename(self.eval_set_path).rsplit(".", 1)[0]
            if self.eval_set_path
            else "eval"
        )
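        # Truncate and pad to fixed widths so entries line up when several
        # runs are rendered in a list.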
        truncated_name = eval_set_name[:8]
        time_str = self.start_time.strftime("%H:%M:%S")
        duration_str = self.duration[:6]

        text = Text()
        text.append(f"{status_icon:<2} ", style=status_colors.get(self.status, "white"))
        text.append(f"{truncated_name:<8} ")
        text.append(f"({time_str:<8}) ")
        text.append(f"[{duration_str:<6}]")

        return text

    @property
    def total_evaluations(self) -> int:
        """Get total number of evaluations."""
        return len(self.evaluation_results)

    @property
    def evaluator_scores(self) -> dict[str, float]:
        """Get average score per evaluator across all evaluations."""
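        # Group raw scores by evaluator id, then average each group.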
        scores: dict[str, list[float]] = {}
        for eval_result in self.evaluation_results:
            for ev_result in eval_result.evaluator_results:
                if ev_result.evaluator_id not in scores:
                    scores[ev_result.evaluator_id] = []
                scores[ev_result.evaluator_id].append(ev_result.score)

        return {
            ev_id: sum(s) / len(s) if s else 0.0
            for ev_id, s in scores.items()
        }

    @property
    def overall_score(self) -> float:
        """Get overall average score."""
        all_scores = []
        for eval_result in self.evaluation_results:
            for ev_result in eval_result.evaluator_results:
                all_scores.append(ev_result.score)
        return sum(all_scores) / len(all_scores) if all_scores else 0.0

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "id": self.id,
            "name": self.name,
            "eval_set_path": self.eval_set_path,
            "entrypoint": self.entrypoint,
            "status": self.status,
            "start_time": self.start_time.isoformat(),
            "end_time": self.end_time.isoformat() if self.end_time else None,
            "evaluator_refs": self.evaluator_refs,
            "evaluation_results": [
                {
                    "eval_id": er.eval_id,
                    "eval_name": er.eval_name,
                    "evaluator_results": [
                        {
                            "evaluator_id": evr.evaluator_id,
                            "evaluator_name": evr.evaluator_name,
                            "score": evr.score,
                            "details": evr.details,
                            "evaluation_time": evr.evaluation_time,
                            "justification": evr.justification,
                        }
                        for evr in er.evaluator_results
                    ],
                }
                for er in self.evaluation_results
            ],
            "error": self.error.to_dict() if self.error else None,
            # Execution options
            "no_report": self.no_report,
            "workers": self.workers,
            "eval_set_run_id": self.eval_set_run_id,
            "enable_mocker_cache": self.enable_mocker_cache,
            "eval_ids": self.eval_ids,
            "report_coverage": self.report_coverage,
            "output_file": self.output_file,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "EvalRun":
        """Create from dictionary."""
        error_data = data.get("error")
        error = UiPathErrorContract.from_dict(error_data) if error_data else None

        eval_run = cls(
            id=data["id"],
            name=data.get("name", ""),
            eval_set_path=data["eval_set_path"],
            entrypoint=data["entrypoint"],
            status=data.get("status", "pending"),
            start_time=datetime.fromisoformat(data["start_time"])
            if data.get("start_time")
            else None,
            end_time=datetime.fromisoformat(data["end_time"])
            if data.get("end_time")
            else None,
            evaluator_refs=data.get("evaluator_refs", []),
            error=error,
            # Execution options
            no_report=data.get("no_report", False),
            workers=data.get("workers", 1),
            eval_set_run_id=data.get("eval_set_run_id"),
            enable_mocker_cache=data.get("enable_mocker_cache", False),
            eval_ids=data.get("eval_ids", []),
            report_coverage=data.get("report_coverage", False),
            output_file=data.get("output_file"),
        )

        # Rebuild evaluation results: they are not an __init__ parameter,
        # so attach them to the instance after construction.
        for er_data in data.get("evaluation_results", []):
            eval_result = EvaluationResult(
                eval_id=er_data["eval_id"],
                eval_name=er_data.get("eval_name", er_data["eval_id"]),
            )
            for evr_data in er_data.get("evaluator_results", []):
                eval_result.evaluator_results.append(
                    EvaluatorResult(
                        evaluator_id=evr_data["evaluator_id"],
                        evaluator_name=evr_data.get("evaluator_name", evr_data["evaluator_id"]),
                        score=evr_data.get("score", 0.0),
                        details=evr_data.get("details", ""),
                        evaluation_time=evr_data.get("evaluation_time", 0.0),
                        justification=evr_data.get("justification", ""),
                    )
                )
            eval_run.evaluation_results.append(eval_result)

        return eval_run
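

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): build a run, attach one
    # evaluation result, and round-trip it through to_dict()/from_dict().
    # The paths, ids, and names below are placeholder values, not anything
    # the uipath runtime requires.
    run = EvalRun(
        eval_set_path="evals/eval-set.json",
        entrypoint="main.py",
        name="Local smoke run",
    )
    run.status = "completed"
    run.end_time = datetime.now()
    run.evaluation_results.append(
        EvaluationResult(
            eval_id="eval-1",
            eval_name="greeting",
            evaluator_results=[
                EvaluatorResult(
                    evaluator_id="exact-match",
                    evaluator_name="Exact Match",
                    score=1.0,
                    justification="Output matched the expected value.",
                )
            ],
        )
    )

    # Serialize and restore; scores and metadata survive the round trip.
    restored = EvalRun.from_dict(run.to_dict())
    print(restored.display_name.plain)
    print(f"overall={restored.overall_score:.2f}", restored.evaluator_scores)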