
Commit e59bd33

Merge pull request #1000 from UiPath/mj/fix-user-faults
fix: user faults should be reported as failed eval runs
2 parents 753ec56 + d18d27f commit e59bd33

File tree

7 files changed: +234 -2 lines changed
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@

{
  "version": "1.0",
  "id": "CrashScenarios",
  "name": "Crash Scenarios Evaluation",
  "evaluatorRefs": [],
  "evaluations": [
    {
      "id": "crash-string-input-a",
      "name": "Crash when input 'a' is a string",
      "inputs": {
        "a": "not a number",
        "b": 5,
        "operator": "+"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-string-input-b",
      "name": "Crash when input 'b' is a string",
      "inputs": {
        "a": 10,
        "b": "invalid",
        "operator": "*"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-both-inputs-strings",
      "name": "Crash when both inputs are strings",
      "inputs": {
        "a": "hello",
        "b": "world",
        "operator": "-"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-null-input-a",
      "name": "Crash when input 'a' is null",
      "inputs": {
        "a": null,
        "b": 25,
        "operator": "/"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-boolean-input",
      "name": "Crash when input is a boolean",
      "inputs": {
        "a": true,
        "b": 3,
        "operator": "+"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-array-input",
      "name": "Crash when input is an array",
      "inputs": {
        "a": [1, 2, 3],
        "b": 5,
        "operator": "*"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-object-input",
      "name": "Crash when input is an object",
      "inputs": {
        "a": {"value": 10},
        "b": 3,
        "operator": "-"
      },
      "evaluationCriterias": {}
    }
  ]
}
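
Each entry pairs an id and a human-readable name with the inputs handed to the calculator agent; "evaluationCriterias" is left empty because these scenarios only need the run to fault. A minimal sketch of reading the set and listing what each scenario feeds the agent — the file path is an assumption matching the one used by the run script added later in this commit:

# Illustrative only: enumerate the crash scenarios defined in the eval set above.
import json

# Path assumed from the run script included elsewhere in this commit.
EVAL_SET = "samples/calculator/evaluations/eval-sets/crash-scenarios.json"

with open(EVAL_SET, encoding="utf-8") as f:
    eval_set = json.load(f)

for evaluation in eval_set["evaluations"]:
    inputs = evaluation["inputs"]
    print(
        f"{evaluation['id']}: a={inputs['a']!r}, b={inputs['b']!r}, "
        f"op={inputs['operator']!r}"
    )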

samples/calculator/main.py

Lines changed: 3 additions & 1 deletion
@@ -43,7 +43,9 @@ class Wrapper:

 @traced()
 @mockable(example_calls=GET_RANDOM_OPERATOR_EXAMPLES)
-async def get_random_operator() -> Wrapper:  # pragma: no cover (since eval mocks this function, ignore it from coverage!)
+async def get_random_operator() -> (
+    Wrapper
+):  # pragma: no cover (since eval mocks this function, ignore it from coverage!)
     """Get a random operator."""
     return Wrapper(
         result=random.choice(
src/uipath/_cli/_evals/_runtime.py

Lines changed: 17 additions & 1 deletion
@@ -476,17 +476,33 @@ async def _execute_eval(
             )
         )

+        exception_details = None
+        agent_output = agent_execution_output.result.output
+        if agent_execution_output.result.status == UiPathRuntimeStatus.FAULTED:
+            error = agent_execution_output.result.error
+            if error is not None:
+                # we set the exception details for the run event
+                # Convert error contract to exception
+                error_exception = Exception(
+                    f"{error.title}: {error.detail} (code: {error.code})"
+                )
+                exception_details = EvalItemExceptionDetails(
+                    exception=error_exception
+                )
+                agent_output = error.model_dump()
+
         await event_bus.publish(
             EvaluationEvents.UPDATE_EVAL_RUN,
             EvalRunUpdatedEvent(
                 execution_id=execution_id,
                 eval_item=eval_item,
                 eval_results=evaluation_item_results,
                 success=not agent_execution_output.result.error,
-                agent_output=agent_execution_output.result.output,
+                agent_output=agent_output,
                 agent_execution_time=agent_execution_output.execution_time,
                 spans=agent_execution_output.spans,
                 logs=agent_execution_output.logs,
+                exception_details=exception_details,
             ),
             wait_for_completion=False,
         )
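
This is the core of the fix: when the agent run finishes FAULTED, the error contract is surfaced as exception details on the eval run event and the error payload replaces the agent output, so the run is reported as failed instead of appearing to succeed with empty output. A minimal, self-contained sketch of that behavior follows — the ErrorContract and ExecutionResult stand-ins are assumptions for illustration, not the SDK's actual types:

# Illustrative sketch only; the real types live in uipath's eval runtime.
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class ErrorContract:  # assumed stand-in for the runtime's error contract
    title: str
    detail: str
    code: str

    def model_dump(self) -> dict[str, Any]:
        return {"title": self.title, "detail": self.detail, "code": self.code}


@dataclass
class ExecutionResult:  # assumed stand-in for agent_execution_output.result
    status: str
    output: Any
    error: Optional[ErrorContract] = None


def summarize_eval_run(result: ExecutionResult) -> dict[str, Any]:
    """Mirror the diff's logic: faulted runs carry exception details and the error payload."""
    exception_details: Optional[Exception] = None
    agent_output = result.output
    if result.status == "faulted" and result.error is not None:
        # Convert the error contract into an exception, as the diff does.
        exception_details = Exception(
            f"{result.error.title}: {result.error.detail} (code: {result.error.code})"
        )
        agent_output = result.error.model_dump()
    return {
        "success": result.error is None,
        "agent_output": agent_output,
        "exception_details": exception_details,
    }


if __name__ == "__main__":
    faulted = ExecutionResult(
        status="faulted",
        output=None,
        error=ErrorContract("User fault", "unsupported operand type(s)", "USER_ERROR"),
    )
    # success is False and the output is the dumped error contract.
    print(summarize_eval_run(faulted))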
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@

[project]
name = "calculator-agent"
version = "0.0.1"
description = "Calculator agent testcase with custom evaluators"
authors = [{ name = "John Doe", email = "john.doe@myemail.com" }]
dependencies = [
    "uipath",
]
requires-python = ">=3.11"

[tool.uv.sources]
uipath = { path = "../../", editable = true }
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@

#!/bin/bash
set -e

echo "Syncing dependencies..."
uv sync

echo "Authenticating with UiPath..."
uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL"

echo "Running evaluations with custom evaluator..."
uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/crash-scenarios.json --no-report

echo "Test completed successfully!"
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@

"""Assertions for calculator-crash-evals testcase.

This script validates that the calculator crash evaluations work correctly by:
1. Reading the evaluation output from __uipath/output.json
2. Validating that all evaluations have scores equal to 0 (since the calculator crashes)
"""

import json
import os


def main() -> None:
    """Main assertion logic."""
    # Check if output file exists
    output_file = "__uipath/output.json"

    assert os.path.isfile(output_file), (
        f"Evaluation output file '{output_file}' not found"
    )
    print(f"✓ Found evaluation output file: {output_file}")

    # Load evaluation results
    with open(output_file, "r", encoding="utf-8") as f:
        output_data = json.load(f)

    print("✓ Loaded evaluation output")

    # Check status
    status = output_data.get("status")
    assert status == "successful", f"Evaluation run failed with status: {status}"
    print("✓ Evaluation run status: successful")

    # Extract output data
    output = output_data.get("output", {})

    # Validate structure
    assert "evaluationSetResults" in output, "Missing 'evaluationSetResults' in output"

    evaluation_results = output["evaluationSetResults"]
    assert len(evaluation_results) > 0, "No evaluation results found"

    print(f"✓ Found {len(evaluation_results)} evaluation result(s)")

    # Validate each evaluation result
    passed_count = 0
    failed_count = 0
    skipped_count = 0
    all_scores_zero = True

    for eval_result in evaluation_results:
        eval_name = eval_result.get("evaluationName", "Unknown")
        print(f"\n→ Validating: {eval_name}")

        try:
            # Validate evaluation results are present
            eval_run_results = eval_result.get("evaluationRunResults", [])
            if len(eval_run_results) == 0:
                print(f"  ⊘ Skipping '{eval_name}' (no evaluation run results)")
                skipped_count += 1
                continue

            # Check that all evaluations have scores equal to 0
            all_passed = True
            for eval_run in eval_run_results:
                evaluator_name = eval_run.get("evaluatorName", "Unknown")
                result = eval_run.get("result", {})
                score = result.get("score", 0)

                # Check if score is equal to 0
                if score == 0:
                    print(f"  ✓ {evaluator_name}: score={score:.1f} (expected 0)")
                else:
                    print(f"  ✗ {evaluator_name}: score={score:.1f} (expected 0)")
                    all_passed = False
                    all_scores_zero = False

            if all_passed:
                print(f"  ✓ All evaluators passed for '{eval_name}' (all scores are 0)")
                passed_count += 1
            else:
                print(f"  ✗ Some evaluators failed for '{eval_name}'")
                failed_count += 1

        except Exception as e:
            print(f"  ✗ Error validating '{eval_name}': {e}")
            failed_count += 1

    # Final summary
    print(f"\n{'=' * 60}")
    print("Summary:")
    print(f"  Total evaluations: {passed_count + failed_count + skipped_count}")
    print(f"  ✓ Passed: {passed_count}")
    print(f"  ✗ Failed: {failed_count}")
    print(f"  ⊘ Skipped: {skipped_count}")
    print(f"{'=' * 60}")

    assert failed_count == 0, "Some assertions failed"
    assert all_scores_zero, "Not all evaluation scores are 0 as expected"

    print(
        "\n✅ All assertions passed! All scores are 0 as expected for crash scenarios."
    )


if __name__ == "__main__":
    main()
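
For reference, a minimal payload that would satisfy these assertions — the field names come directly from the keys the script reads, while the concrete evaluation and evaluator names are illustrative, not values produced by the runtime:

# Illustrative only: write a __uipath/output.json with the fields the assertion script checks.
import json
import os

sample_output = {
    "status": "successful",
    "output": {
        "evaluationSetResults": [
            {
                "evaluationName": "Crash when input 'a' is a string",  # taken from the eval set above
                "evaluationRunResults": [
                    {
                        "evaluatorName": "ExampleEvaluator",  # hypothetical evaluator name
                        "result": {"score": 0},  # crash scenarios are expected to score 0
                    }
                ],
            }
        ]
    },
}

os.makedirs("__uipath", exist_ok=True)
with open("__uipath/output.json", "w", encoding="utf-8") as f:
    json.dump(sample_output, f, indent=2)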
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@

{
  "functions": {
    "main": "../../samples/calculator/main.py:main"
  }
}

0 commit comments
