Skip to content

Commit 0d4b107

Browse files
authored
Eval claude codex (#387)
1 parent d27fe0c commit 0d4b107

File tree

10 files changed

+542
-64
lines changed

10 files changed

+542
-64
lines changed

evals/buffbench/README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,37 @@ The AI judge evaluates three dimensions:
133133
- **Binary Installation**: Install required tools (e.g., linters, test runners) in isolated environments
134134
- **Custom Environment**: Set environment variables for evaluation runs
135135

136+
### External CLI Agents
137+
138+
BuffBench supports running external CLI coding agents for comparison:
139+
140+
- **Claude Code**: add `external:claude` to the `agents` list; requires the `claude` CLI to be installed
141+
- **Codex**: add `external:codex` to the `agents` list; requires the `codex` CLI to be installed
142+
143+
Example comparing Codebuff vs Claude Code:
144+
145+
```typescript
146+
await runBuffBench({
147+
evalDataPath: 'evals/buffbench/eval-codebuff.json',
148+
agents: ['base2', 'external:claude'],
149+
taskConcurrency: 3,
150+
})
151+
```
152+
153+
### Prerequisites for External Agents
154+
155+
**Claude Code CLI:**
156+
```bash
157+
npm install -g @anthropic-ai/claude-code
158+
# Set ANTHROPIC_API_KEY or CLAUDE_CODE_KEY environment variable
159+
```
160+
161+
**Codex CLI:**
162+
```bash
163+
npm install -g @openai/codex
164+
# Set OPENAI_API_KEY environment variable
165+
```
166+
136167
## Directory Structure
137168

138169
```

evals/buffbench/agent-runner.ts

Lines changed: 35 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import fs from 'fs'
2-
import path from 'path'
31
import { execSync } from 'child_process'
42
import { promisify } from 'util'
53
import { exec } from 'child_process'
@@ -9,13 +7,16 @@ const execAsync = promisify(exec)
97
import { withTimeout } from '@codebuff/common/util/promise'
108
import { CodebuffClient } from '@codebuff/sdk'
119
import { withTestRepo } from '../subagents/test-repo-utils'
10+
import { ClaudeRunner } from './runners/claude'
11+
import { CodexRunner } from './runners/codex'
12+
import { CodebuffRunner } from './runners/codebuff'
1213

13-
import type { PrintModeEvent } from '@codebuff/common/types/print-mode'
1414
import type { EvalCommitV2, FinalCheckOutput } from './types'
15+
import type { Runner, AgentStep } from './runners/runner'
1516

16-
export type AgentStep = PrintModeEvent
17+
export type { AgentStep }
1718

18-
const DEBUG_ERROR = true
19+
export type ExternalAgentType = 'claude' | 'codex'
1920

2021
export async function runAgentOnCommit({
2122
client,
@@ -27,6 +28,7 @@ export async function runAgentOnCommit({
2728
localAgentDefinitions,
2829
printEvents,
2930
finalCheckCommands,
31+
externalAgentType,
3032
}: {
3133
client: CodebuffClient
3234
agentId: string
@@ -37,6 +39,7 @@ export async function runAgentOnCommit({
3739
localAgentDefinitions: any[]
3840
printEvents: boolean
3941
finalCheckCommands?: string[]
42+
externalAgentType?: ExternalAgentType
4043
}): Promise<{
4144
diff: string
4245
contextFiles: Record<string, string>
@@ -66,59 +69,33 @@ export async function runAgentOnCommit({
6669
env,
6770
},
6871
async (repoDir) => {
69-
const maxAgentSteps = 40
70-
const result = await client.run({
71-
agent: agentId,
72-
prompt: commit.prompt,
73-
agentDefinitions: localAgentDefinitions,
74-
cwd: repoDir,
75-
env,
76-
maxAgentSteps,
77-
handleEvent: (event) => {
78-
if (
79-
(event.type === 'tool_call' || event.type === 'tool_result') &&
80-
event.toolName === 'set_messages'
81-
) {
82-
return
83-
}
84-
if (event.type === 'error') {
85-
console.error(
86-
`[${commit.id}:${agentId}] Error event:`,
87-
event.message,
88-
)
89-
if (DEBUG_ERROR && !event.message.startsWith('Invalid JSON')) {
90-
// Save errors in a file, but not tool calls with invalid json.
91-
fs.writeFileSync(
92-
path.join(
93-
__dirname,
94-
`${commit.id}-${agentId}-error-${Math.random().toString(36).substring(2, 6)}.json`,
95-
),
96-
JSON.stringify(
97-
{
98-
error: event.message,
99-
trace: trace,
100-
},
101-
null,
102-
2,
103-
),
104-
)
105-
}
106-
} else if (printEvents) {
107-
console.log(
108-
`[${commit.id}:${agentId}]`,
109-
JSON.stringify(event, null, 2),
110-
)
111-
}
112-
trace.push(event)
113-
},
114-
})
115-
cost = (result.sessionState?.mainAgentState.creditsUsed ?? 0) / 100
116-
117-
execSync('git add .', { cwd: repoDir, stdio: 'ignore' })
118-
diff = execSync(`git diff ${commit.parentSha}`, {
119-
cwd: repoDir,
120-
encoding: 'utf-8',
121-
})
72+
// Select the appropriate runner
73+
let runner: Runner
74+
if (externalAgentType === 'claude') {
75+
runner = new ClaudeRunner(repoDir, env)
76+
} else if (externalAgentType === 'codex') {
77+
runner = new CodexRunner(repoDir, env)
78+
} else {
79+
runner = new CodebuffRunner({
80+
cwd: repoDir,
81+
env,
82+
client,
83+
agentId,
84+
localAgentDefinitions,
85+
printEvents,
86+
commitId: commit.id,
87+
parentSha: commit.parentSha,
88+
})
89+
}
90+
91+
console.log(
92+
`[${commit.id}] Running agent: ${externalAgentType || 'codebuff'}`,
93+
)
94+
95+
const result = await runner.run(commit.prompt)
96+
trace.push(...result.steps)
97+
cost = result.totalCostUsd
98+
diff = result.diff
12299

123100
const contextFilePaths = new Set<string>([
124101
...commit.supplementalFiles,

evals/buffbench/main-single-eval.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ import { runBuffBench } from './run-buffbench'
55
async function main() {
66
await runBuffBench({
77
evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
8-
agents: ['base2-opus'],
9-
taskIds: ['add-spawn-perms-tests'],
8+
agents: ['base2'],
9+
taskIds: ['filter-system-history'],
1010
})
1111

1212
process.exit(0)

evals/buffbench/main.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,13 @@ import path from 'path'
33
import { runBuffBench } from './run-buffbench'
44

55
async function main() {
6+
// Compare Codebuff agents against external CLI agents
7+
// Use 'external:claude' for Claude Code CLI
8+
// Use 'external:codex' for OpenAI Codex CLI
69
await runBuffBench({
710
evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
8-
agents: ['base2', 'base2-max'],
9-
taskConcurrency: 3,
11+
agents: ['base2', 'external:claude', 'external:codex'],
12+
taskConcurrency: 1,
1013
})
1114

1215
process.exit(0)

evals/buffbench/run-buffbench.ts

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import { getUserCredentials } from '@codebuff/npm-app/credentials'
88
import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
99
import pLimit from 'p-limit'
1010

11-
import { runAgentOnCommit } from './agent-runner'
11+
import { runAgentOnCommit, type ExternalAgentType } from './agent-runner'
1212
import { formatTaskResults } from './format-output'
1313
import { judgeCommitResult } from './judge'
1414
import { analyzeAgentTraces, type AgentTraceData } from './trace-analyzer'
@@ -18,6 +18,22 @@ import { logger } from '../logger'
1818
import type { AgentEvalResults, EvalDataV2 } from './types'
1919
import { analyzeAllTasks } from './meta-analyzer'
2020

21+
/**
 * Interprets one entry of the `agents` list.
 *
 * Entries of the form `external:<type>` select an external CLI agent; any
 * other entry is returned as a plain agent id with no external type.
 *
 * @param agent - Agent spec, e.g. `base2` or `external:claude`.
 * @returns `agentId` is always the spec exactly as given (including the
 *   `external:` prefix when present). `externalAgentType` is only present
 *   for `external:` specs.
 * @throws Error when the spec starts with `external:` but the type after the
 *   prefix is not one of the supported external agents.
 */
function parseAgentId(agent: string): {
  agentId: string
  externalAgentType?: ExternalAgentType
} {
  const externalPrefix = 'external:'
  if (!agent.startsWith(externalPrefix)) {
    return { agentId: agent }
  }

  // Single source of truth for both the validation and the error message,
  // so the two cannot drift apart when a new external agent is added.
  const supportedTypes: readonly ExternalAgentType[] = ['claude', 'codex']
  const requestedType = agent.slice(externalPrefix.length)

  // `find` yields a value already typed as ExternalAgentType, so the raw
  // string is never asserted to be a valid type before it has been checked.
  const externalAgentType = supportedTypes.find(
    (supported) => supported === requestedType,
  )
  if (externalAgentType === undefined) {
    throw new Error(
      `Unknown external agent type: ${requestedType}. Supported: ${supportedTypes.join(', ')}`,
    )
  }

  return { agentId: agent, externalAgentType }
}
36+
2137
async function runTask(options: {
2238
client: CodebuffClient
2339
commit: EvalDataV2['evalCommits'][0]
@@ -64,7 +80,9 @@ async function runTask(options: {
6480
// Store trace data for this commit to analyze later
6581
const commitTraces: AgentTraceData[] = []
6682

67-
const agentPromises = agents.map(async (agentId) => {
83+
const agentPromises = agents.map(async (agent) => {
84+
const { agentId, externalAgentType } = parseAgentId(agent)
85+
6886
const agentResult = await runAgentOnCommit({
6987
client,
7088
agentId,
@@ -75,6 +93,7 @@ async function runTask(options: {
7593
localAgentDefinitions,
7694
printEvents,
7795
finalCheckCommands,
96+
externalAgentType,
7897
})
7998

8099
const judgeResult = await judgeCommitResult({

0 commit comments

Comments
 (0)