fix: legacy evaluation reporting with Strategy Pattern

Chibi Vikram · claude · Chibi Vikram · commit c6cd5c34f3bf · 2025-12-18T20:36:15.000-08:00
This PR fixes legacy evaluation reporting to the backend, which was returning HTTP 400 errors, and implements the Strategy Pattern for cleaner code separation.

## Changes

### Strategy Pattern Implementation

- Created `EvalReportingStrategy` Protocol defining the interface for evaluation reporting strategies
- Implemented `LegacyEvalReportingStrategy` for legacy evaluations:
  - Converts string IDs to deterministic GUIDs using uuid5
  - Uses endpoints without /coded/ prefix
  - Uses assertionRuns format with assertionSnapshot
- Implemented `CodedEvalReportingStrategy` for coded evaluations:
  - Keeps IDs as strings
  - Uses /coded/ endpoint prefix
  - Uses evaluatorRuns format with evaluationCriterias

### Bug Fixes

- Fixed legacy eval API payload structure for backend compatibility
- Added type assertion for project_id to fix mypy errors
- Removed unused ABC, abstractmethod imports after Protocol migration

### Test Results

- All 27 unit tests passing
- All linting checks (ruff, mypy) passing
- Integration testing with calculator sample: all API calls returning HTTP 200 OK

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/samples/calculator/evaluations/eval-sets/legacy.json b/samples/calculator/evaluations/eval-sets/legacy.json
@@ -1,17 +1,17 @@
 {
-  "fileName": "default.json",
-  "id": "default-eval-set-id",
-  "name": "Basic Calculator Evaluation Set",
+  "fileName": "legacy.json",
+  "id": "a1b2c3d4-e5f6-4a89-abcd-ef0123456789",
+  "name": "Basic Calculator Evaluation Set (Legacy)",
   "batchSize": 10,
   "evaluatorRefs": [
-    "equality",
-    "llm-as-a-judge",
-    "json-similarity",
-    "trajectory"
+    "aaaaaaaa-aaaa-4aaa-aaaa-aaaaaaaaaaaa",
+    "bbbbbbbb-bbbb-4bbb-bbbb-bbbbbbbbbbbb",
+    "cccccccc-cccc-4ccc-cccc-cccccccccccc",
+    "dddddddd-dddd-4ddd-dddd-dddddddddddd"
   ],
   "evaluations": [
     {
-      "id": "test-addition",
+      "id": "11111111-1111-4111-8111-111111111111",
       "name": "Test Addition",
       "inputs": {
         "a": 1,
@@ -22,12 +22,12 @@
         "result": 2.0
       },
       "expectedAgentBehavior": "The operation should produce the right output.",
-      "evalSetId": "default-eval-set-id",
+      "evalSetId": "a1b2c3d4-e5f6-4a89-abcd-ef0123456789",
       "createdAt": "2025-09-04T18:54:58.378Z",
       "updatedAt": "2025-09-04T18:55:55.416Z"
     },
     {
-      "id": "test-random-addition-using-llm",
+      "id": "22222222-2222-4222-8222-222222222222",
       "name": "Test Random Addition Using LLM",
       "inputs": {
         "a": 1,
@@ -45,12 +45,12 @@
             "name": "get_random_operator"
           }
       ],
-      "evalSetId": "default-eval-set-id",
+      "evalSetId": "a1b2c3d4-e5f6-4a89-abcd-ef0123456789",
       "createdAt": "2025-09-04T18:54:58.378Z",
       "updatedAt": "2025-09-04T18:55:55.416Z"
     },
     {
-      "id": "test-with-llm-input-mocking",
+      "id": "33333333-3333-4333-8333-333333333333",
       "name": "Test with LLM input mocking",
       "inputs": {},
       "expectedOutput": {
@@ -59,7 +59,7 @@
       "expectedAgentBehavior": "The operation should produce the right output.",
       "simulateInput": true,
       "inputGenerationInstructions": "Generate a multiplication calculation where the first number is 5 and the second number is 7",
-      "evalSetId": "default-eval-set-id",
+      "evalSetId": "a1b2c3d4-e5f6-4a89-abcd-ef0123456789",
       "createdAt": "2025-09-04T18:54:58.378Z",
       "updatedAt": "2025-09-04T18:55:55.416Z"
     }
diff --git a/samples/calculator/evaluations/evaluators/legacy-equality.json b/samples/calculator/evaluations/evaluators/legacy-equality.json
@@ -1,6 +1,6 @@
 {
-    "fileName": "equality.json",
-    "id": "equality",
+    "fileName": "legacy-equality.json",
+    "id": "aaaaaaaa-aaaa-4aaa-aaaa-aaaaaaaaaaaa",
     "name": "Equality Evaluator",
     "description": "An evaluator that judges the agent based on expected output.",
     "category": 0,
diff --git a/samples/calculator/evaluations/evaluators/legacy-json-similarity.json b/samples/calculator/evaluations/evaluators/legacy-json-similarity.json
@@ -1,6 +1,6 @@
 {
-    "fileName": "json-similarity.json",
-    "id": "json-similarity",
+    "fileName": "legacy-json-similarity.json",
+    "id": "cccccccc-cccc-4ccc-cccc-cccccccccccc",
     "name": "JSON Similarity Evaluator",
     "description": "An evaluator that compares JSON structures with tolerance for numeric and string differences.",
     "category": 0,
diff --git a/samples/calculator/evaluations/evaluators/legacy-llm-as-a-judge.json b/samples/calculator/evaluations/evaluators/legacy-llm-as-a-judge.json
@@ -1,6 +1,6 @@
 {
-  "fileName": "llm-as-a-judge.json",
-  "id": "llm-as-a-judge",
+  "fileName": "legacy-llm-as-a-judge.json",
+  "id": "bbbbbbbb-bbbb-4bbb-bbbb-bbbbbbbbbbbb",
   "name": "LLMAsAJudge Evaluator",
   "description": "An evaluator that judges the agent based on it's run history and expected behavior",
   "category": 3,
diff --git a/samples/calculator/evaluations/evaluators/legacy-trajectory.json b/samples/calculator/evaluations/evaluators/legacy-trajectory.json
@@ -1,6 +1,6 @@
 {
-  "fileName": "trajectory.json",
-  "id": "trajectory",
+  "fileName": "legacy-trajectory.json",
+  "id": "dddddddd-dddd-4ddd-dddd-dddddddddddd",
   "name": "Trajectory Evaluator",
   "description": "An evaluator that analyzes the execution trajectory and decision sequence taken by the agent.",
   "category": 3,
diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py
diff --git a/tests/cli/eval/test_progress_reporter.py b/tests/cli/eval/test_progress_reporter.py

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`		`- "fileName": "equality.json",`
`3`		`- "id": "equality",`
	`2`	`+ "fileName": "legacy-equality.json",`
	`3`	`+ "id": "aaaaaaaa-aaaa-4aaa-aaaa-aaaaaaaaaaaa",`
`4`	`4`	`"name": "Equality Evaluator",`
`5`	`5`	`"description": "An evaluator that judges the agent based on expected output.",`
`6`	`6`	`"category": 0,`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`		`- "fileName": "json-similarity.json",`
`3`		`- "id": "json-similarity",`
	`2`	`+ "fileName": "legacy-json-similarity.json",`
	`3`	`+ "id": "cccccccc-cccc-4ccc-cccc-cccccccccccc",`
`4`	`4`	`"name": "JSON Similarity Evaluator",`
`5`	`5`	`"description": "An evaluator that compares JSON structures with tolerance for numeric and string differences.",`
`6`	`6`	`"category": 0,`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`		`- "fileName": "llm-as-a-judge.json",`
`3`		`- "id": "llm-as-a-judge",`
	`2`	`+ "fileName": "legacy-llm-as-a-judge.json",`
	`3`	`+ "id": "bbbbbbbb-bbbb-4bbb-bbbb-bbbbbbbbbbbb",`
`4`	`4`	`"name": "LLMAsAJudge Evaluator",`
`5`	`5`	`"description": "An evaluator that judges the agent based on it's run history and expected behavior",`
`6`	`6`	`"category": 3,`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`		`- "fileName": "trajectory.json",`
`3`		`- "id": "trajectory",`
	`2`	`+ "fileName": "legacy-trajectory.json",`
	`3`	`+ "id": "dddddddd-dddd-4ddd-dddd-dddddddddddd",`
`4`	`4`	`"name": "Trajectory Evaluator",`
`5`	`5`	`"description": "An evaluator that analyzes the execution trajectory and decision sequence taken by the agent.",`
`6`	`6`	`"category": 3,`