Skip to content

Commit 8cff9ac

Browse files
authored
[Evals] task error handling and memory cleanup (#1419)
# why

Error handling on tasks was causing the runner to idle. Also, proper log cleanup is needed to prevent memory leaks.

# what changed

- Remove the nested try/finally around task execution and let the main try/catch/finally handle cleanup
- Add cleanup to EvalLogger

# test plan

<!-- This is an auto-generated description by cubic. -->

---

## Summary by cubic

Improves eval task error handling and cleanup to prevent memory leaks and dangling V3 sessions. Always closes resources and clears logs after each task run.

- **Bug Fixes**
  - Always close the V3 instance in a finally block; log close errors as warnings so they don’t mask task results.
  - Clear the EvalLogger after returning logs to free memory.
  - Track v3Input at the outer scope to ensure cleanup on both success and failure.
- **Refactors**
  - Simplified result logging and removed nested try/finally around task execution.

<sup>Written for commit 34aebe9. Summary will update automatically on new commits.</sup>

<!-- End of auto-generated description by cubic. -->
1 parent b40ae11 commit 8cff9ac

File tree

2 files changed

+39
-14
lines changed

2 files changed

+39
-14
lines changed

packages/evals/index.eval.ts

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,9 @@ const generateFilteredTestcases = (): Testcase[] => {
271271
// Each test is a function that runs the corresponding task module
272272
task: async (input: EvalInput) => {
273273
const logger = new EvalLogger();
274+
// Track V3 instance at outer scope to ensure cleanup in all cases
275+
let v3Input: Awaited<ReturnType<typeof initV3>> | undefined;
276+
274277
try {
275278
// Dynamically import the task based on its name
276279
const taskModulePath = path.join(
@@ -323,9 +326,6 @@ const generateFilteredTestcases = (): Testcase[] => {
323326
}
324327

325328
// Execute the task
326-
// let taskInput: Awaited<ReturnType<typeof initStagehand>>;
327-
let v3Input: Awaited<ReturnType<typeof initV3>> | undefined;
328-
329329
const isAgentTask =
330330
input.name.startsWith("agent/") || input.name.includes("/agent/");
331331
if (USE_API) {
@@ -384,18 +384,15 @@ const generateFilteredTestcases = (): Testcase[] => {
384384
});
385385
}
386386
// Pass full EvalInput to the task (data-driven params available via input.params)
387-
let result;
388-
try {
389-
result = await taskFunction({ ...v3Input, input });
390-
// Log result to console
391-
if (result && result._success) {
392-
console.log(`✅ ${input.name}: Passed`);
393-
} else {
394-
console.log(`❌ ${input.name}: Failed`);
395-
}
396-
} finally {
397-
if (v3Input?.v3) await v3Input.v3.close();
387+
const result = await taskFunction({ ...v3Input, input });
388+
389+
// Log result to console
390+
if (result && result._success) {
391+
console.log(`✅ ${input.name}: Passed`);
392+
} else {
393+
console.log(`❌ ${input.name}: Failed`);
398394
}
395+
399396
return result;
400397
} catch (error) {
401398
// Log any errors that occur during task execution
@@ -419,6 +416,24 @@ const generateFilteredTestcases = (): Testcase[] => {
419416
error: JSON.parse(JSON.stringify(error, null, 2)),
420417
logs: logger.getLogs(),
421418
};
419+
} finally {
420+
// Always close V3 instance, regardless of success or failure.
421+
// This ensures proper cleanup even if the task threw an error or
422+
// the Browserbase session disconnected mid-execution.
423+
if (v3Input?.v3) {
424+
try {
425+
await v3Input.v3.close();
426+
} catch (closeError) {
427+
// Log but don't throw - we don't want close errors to mask
428+
// the original task result or prevent subsequent evals
429+
console.error(
430+
`Warning: Error closing V3 instance for ${input.name}:`,
431+
closeError,
432+
);
433+
}
434+
}
435+
// Clear logger to free memory (logs already captured in result)
436+
logger.clear();
422437
}
423438
},
424439
// Use the scoring functions defined above

packages/evals/logger.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,4 +121,14 @@ export class EvalLogger {
121121
getLogs(): LogLineEval[] {
122122
return this.logs || [];
123123
}
124+
125+
/**
126+
* clear:
127+
* Clears all stored logs and drops the stagehand reference to free memory.
128+
* Should be called after logs have been retrieved and processed.
129+
*/
130+
clear(): void {
131+
this.logs = [];
132+
this.stagehand = undefined;
133+
}
124134
}

0 commit comments

Comments
 (0)