Skip to content

Commit b4480b5

Browse files
committed
buffbench: accept evalDataPaths and fix examples
1 parent 1b60433 commit b4480b5

File tree

5 files changed: 32 additions (+32) and 9 deletions (-9).

evals/buffbench/README.md

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -144,7 +144,7 @@ Example comparing Codebuff vs Claude Code:
144144

145145
```typescript
146146
await runBuffBench({
147-
evalDataPath: 'evals/buffbench/eval-codebuff.json',
147+
evalDataPaths: ['evals/buffbench/eval-codebuff.json'],
148148
agents: ['base2', 'external:claude'],
149149
taskConcurrency: 3,
150150
})
@@ -204,7 +204,7 @@ evals/buffbench/
204204
import { runBuffBench } from './run-buffbench'
205205

206206
await runBuffBench({
207-
evalDataPath: 'eval-codebuff.json',
207+
evalDataPaths: ['eval-codebuff.json'],
208208
agents: ['base2', 'base2-fast'],
209209
taskConcurrency: 3,
210210
})
@@ -378,7 +378,7 @@ logs/YYYY-MM-DDTHH-MM_agent1_vs_agent2/
378378
{
379379
"metadata": {
380380
"timestamp": "2024-01-15T10:30:00.000Z",
381-
"evalDataPath": "eval-codebuff.json",
381+
"evalDataPaths": ["eval-codebuff.json"],
382382
"agentsTested": ["base2", "base2-fast"],
383383
"commitsEvaluated": 10,
384384
"logsDirectory": "logs/..."

evals/buffbench/main-nightly.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ async function main() {
1212
console.log()
1313

1414
const results = await runBuffBench({
15-
evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
15+
evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
1616
agents: ['base2-lite'],
1717
taskConcurrency: 3,
1818
})

evals/buffbench/main-single-eval.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ import { runBuffBench } from './run-buffbench'
44

55
async function main() {
66
await runBuffBench({
7-
evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
7+
evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
88
agents: ['base2'],
99
taskIds: ['filter-system-history'],
1010
})

evals/buffbench/main.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ async function main() {
77
// Use 'external:claude' for Claude Code CLI
88
// Use 'external:codex' for OpenAI Codex CLI
99
await runBuffBench({
10-
evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
10+
evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
1111
agents: ['base2', 'external:claude', 'external:codex'],
1212
taskConcurrency: 1,
1313
})

evals/buffbench/run-buffbench.ts

Lines changed: 26 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -284,7 +284,10 @@ function installBinaries(binInstalls: EvalDataV2['binInstalls']): {
284284
}
285285

286286
export async function runBuffBench(options: {
287-
evalDataPath: string
287+
/** Single eval data file path (backward compatibility) */
288+
evalDataPath?: string
289+
/** One or more eval data file paths */
290+
evalDataPaths?: string[]
288291
agents: string[]
289292
taskConcurrency?: number
290293
client?: CodebuffClient
@@ -294,15 +297,34 @@ export async function runBuffBench(options: {
294297
}) {
295298
const {
296299
evalDataPath,
300+
evalDataPaths,
297301
agents,
298302
taskConcurrency = 1,
299303
taskIds,
300304
extractLessons = false,
301305
disableAnalysis = false,
302306
} = options
303307

308+
const resolvedEvalDataPaths =
309+
(evalDataPaths && evalDataPaths.length > 0
310+
? evalDataPaths
311+
: evalDataPath
312+
? [evalDataPath]
313+
: undefined) ?? []
314+
315+
if (resolvedEvalDataPaths.length === 0) {
316+
throw new Error('runBuffBench: provide evalDataPaths (or evalDataPath).')
317+
}
318+
if (resolvedEvalDataPaths.length > 1) {
319+
console.warn(
320+
`runBuffBench: multiple evalDataPaths provided, using first: ${resolvedEvalDataPaths[0]}`,
321+
)
322+
}
323+
324+
const primaryEvalDataPath = resolvedEvalDataPaths[0]
325+
304326
const evalData: EvalDataV2 = JSON.parse(
305-
fs.readFileSync(evalDataPath, 'utf-8'),
327+
fs.readFileSync(primaryEvalDataPath, 'utf-8'),
306328
)
307329

308330
// Install binaries once at the beginning
@@ -512,7 +534,8 @@ export async function runBuffBench(options: {
512534
const finalResults = {
513535
metadata: {
514536
timestamp: new Date().toISOString(),
515-
evalDataPath,
537+
evalDataPath: primaryEvalDataPath,
538+
evalDataPaths: resolvedEvalDataPaths,
516539
agentsTested: agents,
517540
commitsEvaluated: commitsToRun.length,
518541
totalCommitsInEval: evalData.evalCommits.length,

0 commit comments

Comments
 (0)