CodebuffAI
diff --git a/evals/buffbench/README.md b/evals/buffbench/README.md
Lines changed: 31 additions & 0 deletions
diff --git a/evals/buffbench/agent-runner.ts b/evals/buffbench/agent-runner.ts
Lines changed: 35 additions & 58 deletions
diff --git a/evals/buffbench/main-single-eval.ts b/evals/buffbench/main-single-eval.ts
Lines changed: 2 additions & 2 deletions
diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts
Lines changed: 5 additions & 2 deletions
diff --git a/evals/buffbench/run-buffbench.ts b/evals/buffbench/run-buffbench.ts
Lines changed: 21 additions & 2 deletions
@@ -133,6 +133,37 @@ The AI judge evaluates three dimensions:
 - **Binary Installation**: Install required tools (e.g., linters, test runners) in isolated environments
 - **Custom Environment**: Set environment variables for evaluation runs
 
+### External CLI Agents
+
+BuffBench can also run external CLI coding agents alongside Codebuff agents for comparison. To select one, add an `external:`-prefixed entry to the `agents` list:
+
+- **Claude Code**: Use `external:claude` (requires the `claude` CLI to be installed)
+- **Codex**: Use `external:codex` (requires the `codex` CLI to be installed)
+
+Example comparing Codebuff vs Claude Code:
+
+```typescript
+await runBuffBench({
+  evalDataPath: 'evals/buffbench/eval-codebuff.json',
+  agents: ['base2', 'external:claude'],
+  taskConcurrency: 3,
+})
+```
+
+### Prerequisites for External Agents
+
+**Claude Code CLI:**
+```bash
+npm install -g @anthropic-ai/claude-code
+# Set ANTHROPIC_API_KEY or CLAUDE_CODE_KEY environment variable
+```
+
+**Codex CLI:**
+```bash
+npm install -g @openai/codex
+# Set OPENAI_API_KEY environment variable
+```
+
 ## Directory Structure
 
 ```
 
@@ -1,5 +1,3 @@
-import fs from 'fs'
-import path from 'path'
 import { execSync } from 'child_process'
 import { promisify } from 'util'
 import { exec } from 'child_process'
@@ -9,13 +7,16 @@ const execAsync = promisify(exec)
 import { withTimeout } from '@codebuff/common/util/promise'
 import { CodebuffClient } from '@codebuff/sdk'
 import { withTestRepo } from '../subagents/test-repo-utils'
+import { ClaudeRunner } from './runners/claude'
+import { CodexRunner } from './runners/codex'
+import { CodebuffRunner } from './runners/codebuff'
 
-import type { PrintModeEvent } from '@codebuff/common/types/print-mode'
 import type { EvalCommitV2, FinalCheckOutput } from './types'
+import type { Runner, AgentStep } from './runners/runner'
 
-export type AgentStep = PrintModeEvent
+export type { AgentStep }
 
-const DEBUG_ERROR = true
+/** External CLI agents that `runAgentOnCommit` runs through a dedicated runner (`ClaudeRunner` / `CodexRunner`) instead of `CodebuffRunner`. */
+export type ExternalAgentType = 'claude' | 'codex'
 
 export async function runAgentOnCommit({
   client,
@@ -27,6 +28,7 @@ export async function runAgentOnCommit({
   localAgentDefinitions,
   printEvents,
   finalCheckCommands,
+  externalAgentType,
 }: {
   client: CodebuffClient
   agentId: string
@@ -37,6 +39,7 @@ export async function runAgentOnCommit({
   localAgentDefinitions: any[]
   printEvents: boolean
   finalCheckCommands?: string[]
+  externalAgentType?: ExternalAgentType
 }): Promise<{
   diff: string
   contextFiles: Record<string, string>
@@ -66,59 +69,33 @@ export async function runAgentOnCommit({
           env,
         },
         async (repoDir) => {
-          const maxAgentSteps = 40
-          const result = await client.run({
-            agent: agentId,
-            prompt: commit.prompt,
-            agentDefinitions: localAgentDefinitions,
-            cwd: repoDir,
-            env,
-            maxAgentSteps,
-            handleEvent: (event) => {
-              if (
-                (event.type === 'tool_call' || event.type === 'tool_result') &&
-                event.toolName === 'set_messages'
-              ) {
-                return
-              }
-              if (event.type === 'error') {
-                console.error(
-                  `[${commit.id}:${agentId}] Error event:`,
-                  event.message,
-                )
-                if (DEBUG_ERROR && !event.message.startsWith('Invalid JSON')) {
-                  // Save errors in a file, but not tool calls with invalid json.
-                  fs.writeFileSync(
-                    path.join(
-                      __dirname,
-                      `${commit.id}-${agentId}-error-${Math.random().toString(36).substring(2, 6)}.json`,
-                    ),
-                    JSON.stringify(
-                      {
-                        error: event.message,
-                        trace: trace,
-                      },
-                      null,
-                      2,
-                    ),
-                  )
-                }
-              } else if (printEvents) {
-                console.log(
-                  `[${commit.id}:${agentId}]`,
-                  JSON.stringify(event, null, 2),
-                )
-              }
-              trace.push(event)
-            },
-          })
-          cost = (result.sessionState?.mainAgentState.creditsUsed ?? 0) / 100
-
-          execSync('git add .', { cwd: repoDir, stdio: 'ignore' })
-          diff = execSync(`git diff ${commit.parentSha}`, {
-            cwd: repoDir,
-            encoding: 'utf-8',
-          })
+          // Select the appropriate runner
+          let runner: Runner
+          if (externalAgentType === 'claude') {
+            runner = new ClaudeRunner(repoDir, env)
+          } else if (externalAgentType === 'codex') {
+            runner = new CodexRunner(repoDir, env)
+          } else {
+            runner = new CodebuffRunner({
+              cwd: repoDir,
+              env,
+              client,
+              agentId,
+              localAgentDefinitions,
+              printEvents,
+              commitId: commit.id,
+              parentSha: commit.parentSha,
+            })
+          }
+
+          console.log(
+            `[${commit.id}] Running agent: ${externalAgentType || 'codebuff'}`,
+          )
+
+          const result = await runner.run(commit.prompt)
+          trace.push(...result.steps)
+          cost = result.totalCostUsd
+          diff = result.diff
 
           const contextFilePaths = new Set<string>([
             ...commit.supplementalFiles,
 
@@ -5,8 +5,8 @@ import { runBuffBench } from './run-buffbench'
 async function main() {
   await runBuffBench({
     evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
-    agents: ['base2-opus'],
-    taskIds: ['add-spawn-perms-tests'],
+    agents: ['base2'],
+    taskIds: ['filter-system-history'],
   })
 
   process.exit(0)
 
@@ -3,10 +3,13 @@ import path from 'path'
 import { runBuffBench } from './run-buffbench'
 
 async function main() {
+  // Compare Codebuff agents against external CLI agents
+  // Use 'external:claude' for Claude Code CLI
+  // Use 'external:codex' for OpenAI Codex CLI
   await runBuffBench({
     evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
-    agents: ['base2', 'base2-max'],
-    taskConcurrency: 3,
+    agents: ['base2', 'external:claude', 'external:codex'],
+    taskConcurrency: 1,
   })
 
   process.exit(0)
 
@@ -8,7 +8,7 @@ import { getUserCredentials } from '@codebuff/npm-app/credentials'
 import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
 import pLimit from 'p-limit'
 
-import { runAgentOnCommit } from './agent-runner'
+import { runAgentOnCommit, type ExternalAgentType } from './agent-runner'
 import { formatTaskResults } from './format-output'
 import { judgeCommitResult } from './judge'
 import { analyzeAgentTraces, type AgentTraceData } from './trace-analyzer'
@@ -18,6 +18,22 @@ import { logger } from '../logger'
 import type { AgentEvalResults, EvalDataV2 } from './types'
 import { analyzeAllTasks } from './meta-analyzer'
 
+/**
+ * Splits an entry of the `agents` list into the id passed to
+ * `runAgentOnCommit` and, for `external:<type>` entries, the external CLI type.
+ *
+ * @param agent - Agent name as configured, e.g. `base2` or `external:claude`.
+ * @returns `agentId` is always the unmodified input string (prefix included);
+ *   `externalAgentType` is present only for `external:` entries.
+ * @throws Error if the text after `external:` is not `claude` or `codex`.
+ */
+function parseAgentId(agent: string): {
+  agentId: string
+  externalAgentType?: ExternalAgentType
+} {
+  const externalPrefix = 'external:'
+  if (!agent.startsWith(externalPrefix)) {
+    return { agentId: agent }
+  }
+
+  // Keep the suffix a plain `string` until it is validated: the equality
+  // checks narrow it to the literal union, so no `as` assertion is needed.
+  const externalType = agent.slice(externalPrefix.length)
+  if (externalType === 'claude' || externalType === 'codex') {
+    return { agentId: agent, externalAgentType: externalType }
+  }
+
+  throw new Error(
+    `Unknown external agent type: ${externalType}. Supported: claude, codex`,
+  )
+}
+
 async function runTask(options: {
   client: CodebuffClient
   commit: EvalDataV2['evalCommits'][0]
@@ -64,7 +80,9 @@ async function runTask(options: {
   // Store trace data for this commit to analyze later
   const commitTraces: AgentTraceData[] = []
 
-  const agentPromises = agents.map(async (agentId) => {
+  const agentPromises = agents.map(async (agent) => {
+    const { agentId, externalAgentType } = parseAgentId(agent)
+
     const agentResult = await runAgentOnCommit({
       client,
       agentId,
@@ -75,6 +93,7 @@ async function runTask(options: {
       localAgentDefinitions,
       printEvents,
       finalCheckCommands,
+      externalAgentType,
     })
 
     const judgeResult = await judgeCommitResult({