@@ -249,9 +249,19 @@ export async function runBuffBench(options: {
249249
250250 const commitResults = await Promise . allSettled ( commitPromises )
251251
252+ // Track which commits had any agent errors
253+ const commitShasWithErrors = new Set < string > ( )
254+
252255 for ( const result of commitResults ) {
253256 if ( result . status === 'fulfilled' ) {
254- const { agentResults } = result . value
257+ const { commit, agentResults } = result . value
258+
259+ // Check if any agent had an error for this commit
260+ const hasAnyError = agentResults . some ( ( { evalRun } ) => evalRun . error )
261+ if ( hasAnyError ) {
262+ commitShasWithErrors . add ( commit . sha )
263+ }
264+
255265 for ( const { agentId, evalRun } of agentResults ) {
256266 results [ agentId ] . runs . push ( evalRun )
257267 }
@@ -261,23 +271,26 @@ export async function runBuffBench(options: {
261271 }
262272
263273 for ( const [ _agentId , agentData ] of Object . entries ( results ) ) {
264- const successfulRuns = agentData . runs . filter ( ( r ) => ! r . error )
265- const totalRuns = agentData . runs . length
274+ // Filter out runs from commits where ANY agent had an error
275+ const validRuns = agentData . runs . filter (
276+ ( r ) => ! commitShasWithErrors . has ( r . commitSha ) ,
277+ )
266278
267279 agentData . averageScore =
268- successfulRuns . length > 0
269- ? successfulRuns . reduce ( ( sum , r ) => sum + r . judging . overallScore , 0 ) /
270- successfulRuns . length
280+ validRuns . length > 0
281+ ? validRuns . reduce ( ( sum , r ) => sum + r . judging . overallScore , 0 ) /
282+ validRuns . length
271283 : 0
272284
273285 agentData . averageCost =
274- totalRuns > 0
275- ? agentData . runs . reduce ( ( sum , r ) => sum + r . cost , 0 ) / totalRuns
286+ validRuns . length > 0
287+ ? validRuns . reduce ( ( sum , r ) => sum + r . cost , 0 ) / validRuns . length
276288 : 0
277289
278290 agentData . averageDuration =
279- totalRuns > 0
280- ? agentData . runs . reduce ( ( sum , r ) => sum + r . durationMs , 0 ) / totalRuns
291+ validRuns . length > 0
292+ ? validRuns . reduce ( ( sum , r ) => sum + r . durationMs , 0 ) /
293+ validRuns . length
281294 : 0
282295 }
283296
@@ -303,16 +316,24 @@ export async function runBuffBench(options: {
303316 fs . writeFileSync ( finalResultsPath , JSON . stringify ( finalResults , null , 2 ) )
304317
305318 console . log ( `Traces saved to ${ logsDir } ` )
319+ if ( commitShasWithErrors . size > 0 ) {
320+ console . log (
321+ `\nNote: ${ commitShasWithErrors . size } commit(s) had agent errors and were excluded from averages` ,
322+ )
323+ }
306324 console . log ( '\n=== Summary ===' )
307325 for ( const [ agentId , data ] of Object . entries ( results ) ) {
326+ const validRuns = data . runs . filter (
327+ ( r ) => ! commitShasWithErrors . has ( r . commitSha ) ,
328+ )
308329 console . log ( `\n${ agentId } :` )
309330 console . log ( ` Average Score: ${ data . averageScore . toFixed ( 2 ) } /10` )
310- console . log ( ` Average Cost: $$ {data . averageCost . toFixed ( 4 ) } ` )
331+ console . log ( ` Average Cost: ${ data . averageCost . toFixed ( 4 ) } ` )
311332 console . log (
312333 ` Average Duration: ${ ( data . averageDuration / 1000 ) . toFixed ( 1 ) } s` ,
313334 )
314335 console . log (
315- ` Success : ${ data . runs . filter ( ( r ) => ! r . error ) . length } /${ data . runs . length } ` ,
336+ ` Valid runs : ${ validRuns . length } /${ data . runs . length } (excluding ${ commitShasWithErrors . size } commit(s) with errors) ` ,
316337 )
317338 }
318339
0 commit comments