buffbench: accept evalDataPaths and fix examples

brandonkachen · brandonkachen · commit b4480b5da9b4 · 2025-12-07T20:38:16.000-08:00
diff --git a/evals/buffbench/README.md b/evals/buffbench/README.md
@@ -144,7 +144,7 @@ Example comparing Codebuff vs Claude Code:
 
 ```typescript
 await runBuffBench({
-  evalDataPath: 'evals/buffbench/eval-codebuff.json',
+  evalDataPaths: ['evals/buffbench/eval-codebuff.json'],
   agents: ['base2', 'external:claude'],
   taskConcurrency: 3,
 })
@@ -204,7 +204,7 @@ evals/buffbench/
 import { runBuffBench } from './run-buffbench'
 
 await runBuffBench({
-  evalDataPath: 'eval-codebuff.json',
+  evalDataPaths: ['eval-codebuff.json'],
   agents: ['base2', 'base2-fast'],
   taskConcurrency: 3,
 })
@@ -378,7 +378,7 @@ logs/YYYY-MM-DDTHH-MM_agent1_vs_agent2/
 {
   "metadata": {
     "timestamp": "2024-01-15T10:30:00.000Z",
-    "evalDataPath": "eval-codebuff.json",
+    "evalDataPaths": ["eval-codebuff.json"],
     "agentsTested": ["base2", "base2-fast"],
     "commitsEvaluated": 10,
     "logsDirectory": "logs/..."
diff --git a/evals/buffbench/main-nightly.ts b/evals/buffbench/main-nightly.ts
@@ -12,7 +12,7 @@ async function main() {
   console.log()
 
   const results = await runBuffBench({
-    evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
+    evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
     agents: ['base2-lite'],
     taskConcurrency: 3,
   })
diff --git a/evals/buffbench/main-single-eval.ts b/evals/buffbench/main-single-eval.ts
@@ -4,7 +4,7 @@ import { runBuffBench } from './run-buffbench'
 
 async function main() {
   await runBuffBench({
-    evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
+    evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
     agents: ['base2'],
     taskIds: ['filter-system-history'],
   })
diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts
@@ -7,7 +7,7 @@ async function main() {
   // Use 'external:claude' for Claude Code CLI
   // Use 'external:codex' for OpenAI Codex CLI
   await runBuffBench({
-    evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
+    evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
     agents: ['base2', 'external:claude', 'external:codex'],
     taskConcurrency: 1,
   })
diff --git a/evals/buffbench/run-buffbench.ts b/evals/buffbench/run-buffbench.ts
@@ -284,7 +284,10 @@ function installBinaries(binInstalls: EvalDataV2['binInstalls']): {
 }
 
 export async function runBuffBench(options: {
-  evalDataPath: string
+  /** Single eval data file path, kept for backward compatibility; ignored when evalDataPaths is non-empty */
+  evalDataPath?: string
+  /** One or more eval data file paths; currently only the first is loaded (a warning is logged if more are given) */
+  evalDataPaths?: string[]
   agents: string[]
   taskConcurrency?: number
   client?: CodebuffClient
@@ -294,15 +297,34 @@ export async function runBuffBench(options: {
 }) {
   const {
     evalDataPath,
+    evalDataPaths,
     agents,
     taskConcurrency = 1,
     taskIds,
     extractLessons = false,
     disableAnalysis = false,
   } = options
 
+  const resolvedEvalDataPaths =
+    (evalDataPaths && evalDataPaths.length > 0
+      ? evalDataPaths
+      : evalDataPath
+        ? [evalDataPath]
+        : undefined) ?? []
+
+  if (resolvedEvalDataPaths.length === 0) {
+    throw new Error('runBuffBench: provide evalDataPaths (or evalDataPath).')
+  }
+  if (resolvedEvalDataPaths.length > 1) {
+    console.warn(
+      `runBuffBench: multiple evalDataPaths provided, using first: ${resolvedEvalDataPaths[0]}`,
+    )
+  }
+
+  const primaryEvalDataPath = resolvedEvalDataPaths[0]
+
   const evalData: EvalDataV2 = JSON.parse(
-    fs.readFileSync(evalDataPath, 'utf-8'),
+    fs.readFileSync(primaryEvalDataPath, 'utf-8'),
   )
 
   // Install binaries once at the beginning
@@ -512,7 +534,8 @@ export async function runBuffBench(options: {
   const finalResults = {
     metadata: {
       timestamp: new Date().toISOString(),
-      evalDataPath,
+      evalDataPath: primaryEvalDataPath,
+      evalDataPaths: resolvedEvalDataPaths,
       agentsTested: agents,
       commitsEvaluated: commitsToRun.length,
       totalCommitsInEval: evalData.evalCommits.length,