[Evals] task error handling and memory cleanup (#1419)

miguelg719 · web-flow · commit 8cff9ace1c61 · 2025-12-15T16:43:45.000-08:00
# why

Error handling on tasks was causing the runner to idle. Also, proper log cleanup is needed to prevent memory leaks.

# what changed

- Remove nested try/finally around task execution, let the main try/catch/finally handle cleanup
- Add cleanup to EvalLogger

# test plan

---

## Summary by cubic

Improves eval task error handling and cleanup to prevent memory leaks and dangling V3 sessions. Always closes resources and clears logs after each task run.

- **Bug Fixes**
  - Always close the V3 instance in a finally block; log close errors as warnings so they don’t mask task results.
  - Clear the EvalLogger after returning logs to free memory.
  - Track v3Input at the outer scope to ensure cleanup on both success and failure.
- **Refactors**
  - Simplified result logging and removed nested try/finally around task execution.

<sup>Written for commit 34aebe9. Summary will update automatically on new commits.</sup>
diff --git a/packages/evals/index.eval.ts b/packages/evals/index.eval.ts
@@ -271,6 +271,9 @@ const generateFilteredTestcases = (): Testcase[] => {
       // Each test is a function that runs the corresponding task module
       task: async (input: EvalInput) => {
         const logger = new EvalLogger();
+        // Track V3 instance at outer scope to ensure cleanup in all cases
+        let v3Input: Awaited<ReturnType<typeof initV3>> | undefined;
+
         try {
           // Dynamically import the task based on its name
           const taskModulePath = path.join(
@@ -323,9 +326,6 @@ const generateFilteredTestcases = (): Testcase[] => {
           }
 
           // Execute the task
-          // let taskInput: Awaited<ReturnType<typeof initStagehand>>;
-          let v3Input: Awaited<ReturnType<typeof initV3>> | undefined;
-
           const isAgentTask =
             input.name.startsWith("agent/") || input.name.includes("/agent/");
           if (USE_API) {
@@ -384,18 +384,15 @@ const generateFilteredTestcases = (): Testcase[] => {
             });
           }
           // Pass full EvalInput to the task (data-driven params available via input.params)
-          let result;
-          try {
-            result = await taskFunction({ ...v3Input, input });
-            // Log result to console
-            if (result && result._success) {
-              console.log(`✅ ${input.name}: Passed`);
-            } else {
-              console.log(`❌ ${input.name}: Failed`);
-            }
-          } finally {
-            if (v3Input?.v3) await v3Input.v3.close();
+          const result = await taskFunction({ ...v3Input, input });
+
+          // Log result to console
+          if (result && result._success) {
+            console.log(`✅ ${input.name}: Passed`);
+          } else {
+            console.log(`❌ ${input.name}: Failed`);
           }
+
           return result;
         } catch (error) {
           // Log any errors that occur during task execution
@@ -419,6 +416,24 @@ const generateFilteredTestcases = (): Testcase[] => {
             error: JSON.parse(JSON.stringify(error, null, 2)),
             logs: logger.getLogs(),
           };
+        } finally {
+          // Always close V3 instance, regardless of success or failure.
+          // This ensures proper cleanup even if the task threw an error or
+          // the Browserbase session disconnected mid-execution.
+          if (v3Input?.v3) {
+            try {
+              await v3Input.v3.close();
+            } catch (closeError) {
+              // Log but don't throw - we don't want close errors to mask
+              // the original task result or prevent subsequent evals
+              console.error(
+                `Warning: Error closing V3 instance for ${input.name}:`,
+                closeError,
+              );
+            }
+          }
+          // Clear logger to free memory (logs already captured in result)
+          logger.clear();
         }
       },
       // Use the scoring functions defined above
diff --git a/packages/evals/logger.ts b/packages/evals/logger.ts
@@ -121,4 +121,14 @@ export class EvalLogger {
   getLogs(): LogLineEval[] {
     return this.logs || [];
   }
+
+  /**
+   * clear:
+   * Clears all stored logs to free memory.
+   * Should be called after logs have been retrieved and processed.
+   */
+  clear(): void {
+    this.logs = [];
+    this.stagehand = undefined;
+  }
 }