Commit 5a4dcf0

evals: compare multi-agents
1 parent 04c5e06 commit 5a4dcf0

File tree

5 files changed: +493 −26 lines changed


evals/README.md

Lines changed: 13 additions & 0 deletions
@@ -160,6 +160,19 @@ bun run evals/git-evals/run-git-evals.ts \
   codebuff
 ```
 
+### Multi-Agent Comparison
+
+The evaluation system supports running multiple agents in parallel on the same eval tasks to compare their performance side-by-side. Multi-agent mode is automatically activated when you specify multiple comma-separated agents with the `--agents` flag.
+
+#### Usage
+
+```bash
+# Compare three agents on codebuff eval set
+bun run evals/git-evals/run-eval-set.ts \
+  --agents base,base2,base-lite
+```
+
 ### Creating New Evaluations
 
 #### 1. Pick Commits from Repository
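Besides the per-agent output directories, the new module added below also exposes a `writeComparisonResults` helper that serializes a cross-agent summary to `eval-comparison-<traceId>.json` (presumably wired up in `run-eval-set.ts`, which is not shown in this excerpt). A rough sketch of that file's shape, inferred from the `comparisonData` object in the code; this type is not part of the commit, and the per-eval-set `metrics` value is `FullEvalLog['overall_metrics']` from `./types`, whose exact fields are not shown here:

```ts
// Inferred shape of the comparison summary written by writeComparisonResults.
// Not part of the commit; field names are taken from comparisonData below.
interface EvalComparisonSummary {
  timestamp: string // ISO-8601 timestamp of the comparison run
  traceId: string
  agents: Array<{
    agentId: string
    displayName: string
    aggregateMetrics: {
      avgOverallScore: number
      avgCompletionScore: number
      avgCodeQualityScore: number
      avgCostUsd: number
      avgDurationMs: number
      successRate: number // successful_runs / total_runs across eval sets
    }
    // One entry per eval set that completed for this agent;
    // metrics is FullEvalLog['overall_metrics'] (fields defined in ./types)
    evalSets: Array<{ name: string; metrics: unknown }>
    errors: Array<{ evalSetName: string; error: unknown }>
  }>
}
```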
Lines changed: 275 additions & 0 deletions
@@ -0,0 +1,275 @@

```ts
import fs from 'fs'
import path from 'path'

import { runGitEvals } from './run-git-evals'
import type { EvalConfig, FullEvalLog } from './types'

export interface AgentConfig {
  agentId: string
  displayName?: string
}

export interface MultiAgentEvalOptions {
  agents: AgentConfig[]
  evalConfigs: EvalConfig[]
  outputDir: string
  concurrency?: number
  codingAgent: 'codebuff' | 'claude'
  worktreePath?: string
  promptWithAgent: boolean
}

export interface AgentEvalResult {
  agentId: string
  displayName: string
  evalResults: Map<string, FullEvalLog>
  aggregateMetrics: {
    avgOverallScore: number
    avgCompletionScore: number
    avgCodeQualityScore: number
    avgCostUsd: number
    avgDurationMs: number
    successRate: number
  }
  errors: Array<{ evalSetName: string; error: any }>
}

export async function runMultiAgentEvals(
  options: MultiAgentEvalOptions,
): Promise<AgentEvalResult[]> {
  const {
    agents,
    evalConfigs,
    outputDir,
    codingAgent,
    worktreePath,
    promptWithAgent,
    concurrency,
  } = options

  const agentPromises = agents.map(async (agentConfig) => {
    const { agentId, displayName = agentId } = agentConfig

    console.log(`\n${'='.repeat(60)}`)
    console.log(`Starting evaluations for agent: ${displayName} (${agentId})`)
    console.log('='.repeat(60))

    const evalResults = new Map<string, FullEvalLog>()
    const errors: Array<{ evalSetName: string; error: any }> = []

    const evalSetPromises = evalConfigs.map(async (config) => {
      console.log(` Running ${config.name} eval set for ${displayName}...`)

      try {
        const agentOutputDir = path.join(outputDir, agentId)
        if (!fs.existsSync(agentOutputDir)) {
          fs.mkdirSync(agentOutputDir, { recursive: true })
        }

        const result = await runGitEvals(
          config.evalDataPath,
          agentOutputDir,
          codingAgent,
          config.limit,
          false,
          agentId,
          worktreePath,
          promptWithAgent,
        )

        evalResults.set(config.name, result)
        console.log(` ✅ ${config.name} completed for ${displayName}`)
        return { success: true, evalSetName: config.name }
      } catch (error) {
        console.error(` ❌ ${config.name} failed for ${displayName}:`, error)
        errors.push({ evalSetName: config.name, error })
        return { success: false, evalSetName: config.name, error }
      }
    })

    await Promise.allSettled(evalSetPromises)

    const aggregateMetrics = calculateAggregateMetrics(
      Array.from(evalResults.values()),
    )

    return {
      agentId,
      displayName,
      evalResults,
      aggregateMetrics,
      errors,
    }
  })

  const results = await Promise.all(agentPromises)

  return results
}

function calculateAggregateMetrics(evalLogs: FullEvalLog[]) {
  if (evalLogs.length === 0) {
    return {
      avgOverallScore: 0,
      avgCompletionScore: 0,
      avgCodeQualityScore: 0,
      avgCostUsd: 0,
      avgDurationMs: 0,
      successRate: 0,
    }
  }

  const totalMetrics = evalLogs.reduce(
    (acc, log) => ({
      overallScore: acc.overallScore + log.overall_metrics.average_overall,
      completionScore:
        acc.completionScore + log.overall_metrics.average_completion,
      codeQualityScore:
        acc.codeQualityScore + log.overall_metrics.average_code_quality,
      costUsd: acc.costUsd + log.overall_metrics.average_cost_usd,
      durationMs: acc.durationMs + log.overall_metrics.average_duration_ms,
      successfulRuns: acc.successfulRuns + log.overall_metrics.successful_runs,
      totalRuns: acc.totalRuns + log.overall_metrics.total_runs,
    }),
    {
      overallScore: 0,
      completionScore: 0,
      codeQualityScore: 0,
      costUsd: 0,
      durationMs: 0,
      successfulRuns: 0,
      totalRuns: 0,
    },
  )

  const count = evalLogs.length

  return {
    avgOverallScore: totalMetrics.overallScore / count,
    avgCompletionScore: totalMetrics.completionScore / count,
    avgCodeQualityScore: totalMetrics.codeQualityScore / count,
    avgCostUsd: totalMetrics.costUsd / count,
    avgDurationMs: totalMetrics.durationMs / count,
    successRate: totalMetrics.successfulRuns / totalMetrics.totalRuns,
  }
}

export function printComparisonTable(
  results: AgentEvalResult[],
  evalSetNames: string[],
) {
  console.log('\n' + '='.repeat(100))
  console.log('MULTI-AGENT COMPARISON RESULTS')
  console.log('='.repeat(100))

  const colWidth = 20
  console.log('\nAggregate Metrics Across All Eval Sets:')
  console.log('-'.repeat(100))

  const header = [
    'Agent'.padEnd(colWidth),
    'Overall'.padEnd(12),
    'Completion'.padEnd(12),
    'Quality'.padEnd(12),
    'Success Rate'.padEnd(14),
    'Avg Cost ($)'.padEnd(12),
  ].join(' | ')

  console.log(header)
  console.log('-'.repeat(100))

  const sortedResults = [...results].sort(
    (a, b) =>
      b.aggregateMetrics.avgOverallScore - a.aggregateMetrics.avgOverallScore,
  )

  sortedResults.forEach((result) => {
    const row = [
      result.displayName.padEnd(colWidth),
      result.aggregateMetrics.avgOverallScore.toFixed(2).padEnd(12),
      result.aggregateMetrics.avgCompletionScore.toFixed(2).padEnd(12),
      result.aggregateMetrics.avgCodeQualityScore.toFixed(2).padEnd(12),
      `${(result.aggregateMetrics.successRate * 100).toFixed(1)}%`.padEnd(14),
      result.aggregateMetrics.avgCostUsd.toFixed(3).padEnd(12),
    ].join(' | ')

    console.log(row)
  })

  console.log('\n\nPer Eval Set Breakdown:')
  console.log('='.repeat(100))

  evalSetNames.forEach((evalSetName) => {
    console.log(`\n${evalSetName.toUpperCase()}:`)
    console.log('-'.repeat(100))

    const header = [
      'Agent'.padEnd(colWidth),
      'Overall'.padEnd(12),
      'Completion'.padEnd(12),
      'Quality'.padEnd(12),
      'Runs'.padEnd(10),
      'Cost ($)'.padEnd(10),
    ].join(' | ')

    console.log(header)
    console.log('-'.repeat(100))

    sortedResults.forEach((result) => {
      const evalLog = result.evalResults.get(evalSetName)

      if (!evalLog) {
        const errorInfo = result.errors.find(
          (e) => e.evalSetName === evalSetName,
        )
        const errorMsg = errorInfo
          ? ` (${errorInfo.error.message || 'Unknown error'})`
          : ''
        console.log(
          `${result.displayName.padEnd(colWidth)} | N/A - Failed to run${errorMsg}`,
        )
        return
      }

      const metrics = evalLog.overall_metrics
      const row = [
        result.displayName.padEnd(colWidth),
        metrics.average_overall.toFixed(2).padEnd(12),
        metrics.average_completion.toFixed(2).padEnd(12),
        metrics.average_code_quality.toFixed(2).padEnd(12),
        `${metrics.successful_runs}/${metrics.total_runs}`.padEnd(10),
        metrics.average_cost_usd.toFixed(3).padEnd(10),
      ].join(' | ')

      console.log(row)
    })
  })

  console.log('\n' + '='.repeat(100))
}

export function writeComparisonResults(
  results: AgentEvalResult[],
  outputDir: string,
  traceId: string,
) {
  const comparisonData = {
    timestamp: new Date().toISOString(),
    traceId,
    agents: results.map((result) => ({
      agentId: result.agentId,
      displayName: result.displayName,
      aggregateMetrics: result.aggregateMetrics,
      evalSets: Array.from(result.evalResults.entries()).map(([name, log]) => ({
        name,
        metrics: log.overall_metrics,
      })),
      errors: result.errors,
    })),
  }

  const comparisonPath = path.join(outputDir, `eval-comparison-${traceId}.json`)

  fs.writeFileSync(comparisonPath, JSON.stringify(comparisonData, null, 2))
  console.log(`\n📊 Comparison results written to: ${comparisonPath}`)
}
```
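For orientation, here is a minimal sketch of how these exports might be wired together by a runner such as `run-eval-set.ts`. The import path, eval data path, trace-id scheme, and any `EvalConfig` fields beyond `name`, `evalDataPath`, and `limit` are assumptions for illustration only and are not taken from this commit.

```ts
// Illustrative wiring only: the module path, data path, and trace-id scheme
// below are assumptions, not part of this commit.
import { randomUUID } from 'crypto'

import {
  printComparisonTable,
  runMultiAgentEvals,
  writeComparisonResults,
} from './multi-agent-evals' // hypothetical path to the new module

async function main() {
  const outputDir = 'evals/output' // hypothetical output location
  const traceId = randomUUID()

  // EvalConfig may require more fields than shown; only name, evalDataPath,
  // and limit are read by runMultiAgentEvals.
  const evalConfigs = [
    { name: 'codebuff', evalDataPath: 'evals/git-evals/eval-codebuff.json', limit: 5 },
  ] as any[]

  const results = await runMultiAgentEvals({
    agents: [
      { agentId: 'base' },
      { agentId: 'base2' },
      { agentId: 'base-lite', displayName: 'Base Lite' },
    ],
    evalConfigs,
    outputDir,
    codingAgent: 'codebuff',
    promptWithAgent: false,
  })

  printComparisonTable(results, evalConfigs.map((c) => c.name))
  writeComparisonResults(results, outputDir, traceId)
}

main().catch((error) => {
  console.error(error)
  process.exit(1)
})
```

In practice, the `--agents base,base2,base-lite` flag shown in the README presumably produces the same `AgentConfig` list that is passed to `runMultiAgentEvals` here.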
