Skip to content

Commit f6d89b4

Browse files
committed
Exclude eval tasks with errors from scores
1 parent 7963f2b commit f6d89b4

File tree

2 files changed

+33
-14
lines changed

2 files changed

+33
-14
lines changed

evals/buffbench/eval-codebuff.json

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3074,8 +3074,6 @@
30743074
"backend/src/__tests__/main-prompt.test.ts",
30753075
"backend/src/__tests__/tool-call-schema.test.ts",
30763076
"backend/src/__tests__/run-agent-step-tools.test.ts",
3077-
"common/src/util/parse-tool-call-xml.ts",
3078-
"common/src/util/__tests__/parse-tool-call-xml.test.ts",
30793077
"common/src/templates/agent-validation.ts",
30803078
"common/src/types/dynamic-agent-template.ts",
30813079
"npm-app/src/cli-handlers/agents.ts"

evals/buffbench/run-buffbench.ts

Lines changed: 33 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -249,9 +249,19 @@ export async function runBuffBench(options: {
249249

250250
const commitResults = await Promise.allSettled(commitPromises)
251251

252+
// Track which commits had any agent errors
253+
const commitShasWithErrors = new Set<string>()
254+
252255
for (const result of commitResults) {
253256
if (result.status === 'fulfilled') {
254-
const { agentResults } = result.value
257+
const { commit, agentResults } = result.value
258+
259+
// Check if any agent had an error for this commit
260+
const hasAnyError = agentResults.some(({ evalRun }) => evalRun.error)
261+
if (hasAnyError) {
262+
commitShasWithErrors.add(commit.sha)
263+
}
264+
255265
for (const { agentId, evalRun } of agentResults) {
256266
results[agentId].runs.push(evalRun)
257267
}
@@ -261,23 +271,26 @@ export async function runBuffBench(options: {
261271
}
262272

263273
for (const [_agentId, agentData] of Object.entries(results)) {
264-
const successfulRuns = agentData.runs.filter((r) => !r.error)
265-
const totalRuns = agentData.runs.length
274+
// Filter out runs from commits where ANY agent had an error
275+
const validRuns = agentData.runs.filter(
276+
(r) => !commitShasWithErrors.has(r.commitSha),
277+
)
266278

267279
agentData.averageScore =
268-
successfulRuns.length > 0
269-
? successfulRuns.reduce((sum, r) => sum + r.judging.overallScore, 0) /
270-
successfulRuns.length
280+
validRuns.length > 0
281+
? validRuns.reduce((sum, r) => sum + r.judging.overallScore, 0) /
282+
validRuns.length
271283
: 0
272284

273285
agentData.averageCost =
274-
totalRuns > 0
275-
? agentData.runs.reduce((sum, r) => sum + r.cost, 0) / totalRuns
286+
validRuns.length > 0
287+
? validRuns.reduce((sum, r) => sum + r.cost, 0) / validRuns.length
276288
: 0
277289

278290
agentData.averageDuration =
279-
totalRuns > 0
280-
? agentData.runs.reduce((sum, r) => sum + r.durationMs, 0) / totalRuns
291+
validRuns.length > 0
292+
? validRuns.reduce((sum, r) => sum + r.durationMs, 0) /
293+
validRuns.length
281294
: 0
282295
}
283296

@@ -303,16 +316,24 @@ export async function runBuffBench(options: {
303316
fs.writeFileSync(finalResultsPath, JSON.stringify(finalResults, null, 2))
304317

305318
console.log(`Traces saved to ${logsDir}`)
319+
if (commitShasWithErrors.size > 0) {
320+
console.log(
321+
`\nNote: ${commitShasWithErrors.size} commit(s) had agent errors and were excluded from averages`,
322+
)
323+
}
306324
console.log('\n=== Summary ===')
307325
for (const [agentId, data] of Object.entries(results)) {
326+
const validRuns = data.runs.filter(
327+
(r) => !commitShasWithErrors.has(r.commitSha),
328+
)
308329
console.log(`\n${agentId}:`)
309330
console.log(` Average Score: ${data.averageScore.toFixed(2)}/10`)
310-
console.log(` Average Cost: $${data.averageCost.toFixed(4)}`)
331+
console.log(` Average Cost: ${data.averageCost.toFixed(4)}`)
311332
console.log(
312333
` Average Duration: ${(data.averageDuration / 1000).toFixed(1)}s`,
313334
)
314335
console.log(
315-
` Success: ${data.runs.filter((r) => !r.error).length}/${data.runs.length}`,
336+
` Valid runs: ${validRuns.length}/${data.runs.length} (excluding ${commitShasWithErrors.size} commit(s) with errors)`,
316337
)
317338
}
318339

0 commit comments

Comments
 (0)