
Commit e59bd33

Merge pull request #1000 from UiPath/mj/fix-user-faults
fix: user faults should be reported as failed eval runs
2 parents 753ec56 + d18d27f commit e59bd33

File tree

7 files changed: +234 -2 lines changed
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@

{
  "version": "1.0",
  "id": "CrashScenarios",
  "name": "Crash Scenarios Evaluation",
  "evaluatorRefs": [],
  "evaluations": [
    {
      "id": "crash-string-input-a",
      "name": "Crash when input 'a' is a string",
      "inputs": {
        "a": "not a number",
        "b": 5,
        "operator": "+"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-string-input-b",
      "name": "Crash when input 'b' is a string",
      "inputs": {
        "a": 10,
        "b": "invalid",
        "operator": "*"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-both-inputs-strings",
      "name": "Crash when both inputs are strings",
      "inputs": {
        "a": "hello",
        "b": "world",
        "operator": "-"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-null-input-a",
      "name": "Crash when input 'a' is null",
      "inputs": {
        "a": null,
        "b": 25,
        "operator": "/"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-boolean-input",
      "name": "Crash when input is a boolean",
      "inputs": {
        "a": true,
        "b": 3,
        "operator": "+"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-array-input",
      "name": "Crash when input is an array",
      "inputs": {
        "a": [1, 2, 3],
        "b": 5,
        "operator": "*"
      },
      "evaluationCriterias": {}
    },
    {
      "id": "crash-object-input",
      "name": "Crash when input is an object",
      "inputs": {
        "a": {"value": 10},
        "b": 3,
        "operator": "-"
      },
      "evaluationCriterias": {}
    }
  ]
}
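
Each entry pairs an id and a human-readable name with the inputs handed to the calculator agent; "evaluationCriterias" is left empty because these scenarios only need the run to fault. A minimal sketch of reading the set and listing what each scenario feeds the agent — the file path is an assumption matching the one used by the run script added later in this commit:

# Illustrative only: enumerate the crash scenarios defined in the eval set above.
import json

# Path assumed from the run script included elsewhere in this commit.
EVAL_SET = "samples/calculator/evaluations/eval-sets/crash-scenarios.json"

with open(EVAL_SET, encoding="utf-8") as f:
    eval_set = json.load(f)

for evaluation in eval_set["evaluations"]:
    inputs = evaluation["inputs"]
    print(
        f"{evaluation['id']}: a={inputs['a']!r}, b={inputs['b']!r}, "
        f"op={inputs['operator']!r}"
    )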

samples/calculator/main.py

Lines changed: 3 additions & 1 deletion
@@ -43,7 +43,9 @@ class Wrapper:

 @traced()
 @mockable(example_calls=GET_RANDOM_OPERATOR_EXAMPLES)
-async def get_random_operator() -> Wrapper:  # pragma: no cover (since eval mocks this function, ignore it from coverage!)
+async def get_random_operator() -> (
+    Wrapper
+):  # pragma: no cover (since eval mocks this function, ignore it from coverage!)
     """Get a random operator."""
     return Wrapper(
         result=random.choice(
src/uipath/_cli/_evals/_runtime.py

Lines changed: 17 additions & 1 deletion
@@ -476,17 +476,33 @@ async def _execute_eval(
             )
         )

+        exception_details = None
+        agent_output = agent_execution_output.result.output
+        if agent_execution_output.result.status == UiPathRuntimeStatus.FAULTED:
+            error = agent_execution_output.result.error
+            if error is not None:
+                # we set the exception details for the run event
+                # Convert error contract to exception
+                error_exception = Exception(
+                    f"{error.title}: {error.detail} (code: {error.code})"
+                )
+                exception_details = EvalItemExceptionDetails(
+                    exception=error_exception
+                )
+                agent_output = error.model_dump()
+
         await event_bus.publish(
             EvaluationEvents.UPDATE_EVAL_RUN,
             EvalRunUpdatedEvent(
                 execution_id=execution_id,
                 eval_item=eval_item,
                 eval_results=evaluation_item_results,
                 success=not agent_execution_output.result.error,
-                agent_output=agent_execution_output.result.output,
+                agent_output=agent_output,
                 agent_execution_time=agent_execution_output.execution_time,
                 spans=agent_execution_output.spans,
                 logs=agent_execution_output.logs,
+                exception_details=exception_details,
             ),
             wait_for_completion=False,
         )
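
This is the core of the fix: when the agent run finishes FAULTED, the error contract is surfaced as exception details on the eval run event and the error payload replaces the agent output, so the run is reported as failed instead of appearing to succeed with empty output. A minimal, self-contained sketch of that behavior follows — the ErrorContract and ExecutionResult stand-ins are assumptions for illustration, not the SDK's actual types:

# Illustrative sketch only; the real types live in uipath's eval runtime.
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class ErrorContract:  # assumed stand-in for the runtime's error contract
    title: str
    detail: str
    code: str

    def model_dump(self) -> dict[str, Any]:
        return {"title": self.title, "detail": self.detail, "code": self.code}


@dataclass
class ExecutionResult:  # assumed stand-in for agent_execution_output.result
    status: str
    output: Any
    error: Optional[ErrorContract] = None


def summarize_eval_run(result: ExecutionResult) -> dict[str, Any]:
    """Mirror the diff's logic: faulted runs carry exception details and the error payload."""
    exception_details: Optional[Exception] = None
    agent_output = result.output
    if result.status == "faulted" and result.error is not None:
        # Convert the error contract into an exception, as the diff does.
        exception_details = Exception(
            f"{result.error.title}: {result.error.detail} (code: {result.error.code})"
        )
        agent_output = result.error.model_dump()
    return {
        "success": result.error is None,
        "agent_output": agent_output,
        "exception_details": exception_details,
    }


if __name__ == "__main__":
    faulted = ExecutionResult(
        status="faulted",
        output=None,
        error=ErrorContract("User fault", "unsupported operand type(s)", "USER_ERROR"),
    )
    # success is False and the output is the dumped error contract.
    print(summarize_eval_run(faulted))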
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@

[project]
name = "calculator-agent"
version = "0.0.1"
description = "Calculator agent testcase with custom evaluators"
authors = [{ name = "John Doe", email = "john.doe@myemail.com" }]
dependencies = [
    "uipath",
]
requires-python = ">=3.11"

[tool.uv.sources]
uipath = { path = "../../", editable = true }
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@

#!/bin/bash
set -e

echo "Syncing dependencies..."
uv sync

echo "Authenticating with UiPath..."
uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL"

echo "Running evaluations with custom evaluator..."
uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/crash-scenarios.json --no-report

echo "Test completed successfully!"
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@

"""Assertions for calculator-crash-evals testcase.

This script validates that the calculator crash evaluations work correctly by:
1. Reading the evaluation output from __uipath/output.json
2. Validating that all evaluations have scores equal to 0 (since the calculator crashes)
"""

import json
import os


def main() -> None:
    """Main assertion logic."""
    # Check if output file exists
    output_file = "__uipath/output.json"

    assert os.path.isfile(output_file), (
        f"Evaluation output file '{output_file}' not found"
    )
    print(f"✓ Found evaluation output file: {output_file}")

    # Load evaluation results
    with open(output_file, "r", encoding="utf-8") as f:
        output_data = json.load(f)

    print("✓ Loaded evaluation output")

    # Check status
    status = output_data.get("status")
    assert status == "successful", f"Evaluation run failed with status: {status}"
    print("✓ Evaluation run status: successful")

    # Extract output data
    output = output_data.get("output", {})

    # Validate structure
    assert "evaluationSetResults" in output, "Missing 'evaluationSetResults' in output"

    evaluation_results = output["evaluationSetResults"]
    assert len(evaluation_results) > 0, "No evaluation results found"

    print(f"✓ Found {len(evaluation_results)} evaluation result(s)")

    # Validate each evaluation result
    passed_count = 0
    failed_count = 0
    skipped_count = 0
    all_scores_zero = True

    for eval_result in evaluation_results:
        eval_name = eval_result.get("evaluationName", "Unknown")
        print(f"\n→ Validating: {eval_name}")

        try:
            # Validate evaluation results are present
            eval_run_results = eval_result.get("evaluationRunResults", [])
            if len(eval_run_results) == 0:
                print(f"  ⊘ Skipping '{eval_name}' (no evaluation run results)")
                skipped_count += 1
                continue

            # Check that all evaluations have scores equal to 0
            all_passed = True
            for eval_run in eval_run_results:
                evaluator_name = eval_run.get("evaluatorName", "Unknown")
                result = eval_run.get("result", {})
                score = result.get("score", 0)

                # Check if score is equal to 0
                if score == 0:
                    print(f"  ✓ {evaluator_name}: score={score:.1f} (expected 0)")
                else:
                    print(f"  ✗ {evaluator_name}: score={score:.1f} (expected 0)")
                    all_passed = False
                    all_scores_zero = False

            if all_passed:
                print(f"  ✓ All evaluators passed for '{eval_name}' (all scores are 0)")
                passed_count += 1
            else:
                print(f"  ✗ Some evaluators failed for '{eval_name}'")
                failed_count += 1

        except Exception as e:
            print(f"  ✗ Error validating '{eval_name}': {e}")
            failed_count += 1

    # Final summary
    print(f"\n{'=' * 60}")
    print("Summary:")
    print(f"  Total evaluations: {passed_count + failed_count + skipped_count}")
    print(f"  ✓ Passed: {passed_count}")
    print(f"  ✗ Failed: {failed_count}")
    print(f"  ⊘ Skipped: {skipped_count}")
    print(f"{'=' * 60}")

    assert failed_count == 0, "Some assertions failed"
    assert all_scores_zero, "Not all evaluation scores are 0 as expected"

    print(
        "\n✅ All assertions passed! All scores are 0 as expected for crash scenarios."
    )


if __name__ == "__main__":
    main()
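
For reference, a minimal payload that would satisfy these assertions — the field names come directly from the keys the script reads, while the concrete evaluation and evaluator names are illustrative, not values produced by the runtime:

# Illustrative only: write a __uipath/output.json with the fields the assertion script checks.
import json
import os

sample_output = {
    "status": "successful",
    "output": {
        "evaluationSetResults": [
            {
                "evaluationName": "Crash when input 'a' is a string",  # taken from the eval set above
                "evaluationRunResults": [
                    {
                        "evaluatorName": "ExampleEvaluator",  # hypothetical evaluator name
                        "result": {"score": 0},  # crash scenarios are expected to score 0
                    }
                ],
            }
        ]
    },
}

os.makedirs("__uipath", exist_ok=True)
with open("__uipath/output.json", "w", encoding="utf-8") as f:
    json.dump(sample_output, f, indent=2)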
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@

{
  "functions": {
    "main": "../../samples/calculator/main.py:main"
  }
}

0 commit comments
