Skip to content

Commit da40c54

Browse files
committed
Add script to print score table of buffbench logs; prepend task number to log file name
1 parent 61c8f51 commit da40c54

File tree

3 files changed

+197
-5
lines changed

3 files changed

+197
-5
lines changed

evals/buffbench/main.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,15 @@ import { runBuffBench } from './run-buffbench'
66
async function main() {
77
const results = await runBuffBench({
88
evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
9-
agents: ['base', 'base2'],
10-
commitConcurrency: 10,
9+
agents: ['base2-simple', 'base2'],
10+
commitConcurrency: 20,
1111
})
1212

1313
const outputPath = path.join(__dirname, 'results.json')
1414
fs.writeFileSync(outputPath, JSON.stringify(results, null, 2))
1515
console.log(`\nResults written to ${outputPath}`)
16+
17+
process.exit(0)
1618
}
1719

1820
if (import.meta.main) {

evals/buffbench/run-buffbench.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ async function runTask(options: {
7373
const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
7474
const safeAgentId = agentId.replace(/[^a-zA-Z0-9-]/g, '_')
7575
const safeCommitShort = commit.sha.slice(0, 7)
76-
const traceFilename = `${safeTaskId}-${safeAgentId}-${safeCommitShort}.json`
76+
const traceFilename = `${index + 1}-${safeTaskId}-${safeAgentId}-${safeCommitShort}.json`
7777
const tracePath = path.join(logsDir, traceFilename)
7878

7979
// Store judging result and trace for combined output later
@@ -124,7 +124,7 @@ async function runTask(options: {
124124
// Save analysis to logs directory
125125
const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
126126
const analysisCommitShort = commit.sha.slice(0, 7)
127-
const analysisFilename = `${safeTaskId}-ANALYSIS-${analysisCommitShort}.json`
127+
const analysisFilename = `${index + 1}-${safeTaskId}-ANALYSIS-${analysisCommitShort}.json`
128128
const analysisPath = path.join(logsDir, analysisFilename)
129129
fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
130130

@@ -142,7 +142,7 @@ async function runTask(options: {
142142
error: trace.error,
143143
traceFilePath: path.join(
144144
logsDir,
145-
`${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
145+
`${index + 1}-${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
146146
),
147147
})),
148148
traceAnalysis,

scripts/analyze-buffbench-logs.ts

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
#!/usr/bin/env bun
2+
import { readdirSync, readFileSync } from 'fs'
3+
import { join } from 'path'
4+
5+
/**
 * Narrative feedback and numeric scores recorded for one attempt at a task.
 */
interface JudgingResult {
  /** Free-form written assessment. */
  analysis: string
  strengths: string[]
  weaknesses: string[]
  completionScore: number
  codeQualityScore: number
  overallScore: number
}

/**
 * A judged attempt together with the agent that made it and what it consumed.
 *
 * Extends `JudgingResult` instead of repeating its six fields, so the two
 * shapes cannot drift apart.
 */
interface AgentResult extends JudgingResult {
  agentId: string
  /** Reported under a "Cost ($)" column by the summary table. */
  cost: number
  /** Wall time in milliseconds; the summary table divides by 1000 to show seconds. */
  durationMs: number
}

/**
 * Contents of one log file whose name contains `ANALYSIS`: the per-agent
 * results recorded for a single commit.
 */
interface AnalysisFile {
  commitSha: string
  timestamp: string
  results: AgentResult[]
}
31+
32+
/** The numeric measurements of one agent on one task, all verified finite. */
type AgentSample = {
  overallScore: number
  completionScore: number
  codeQualityScore: number
  cost: number
  durationMs: number
}

/** True only for real, finite numbers (rejects NaN, Infinity, null, strings). */
function isFiniteNumber(value: unknown): value is number {
  return typeof value === 'number' && Number.isFinite(value)
}

/** Arithmetic mean; callers guarantee `values` is non-empty. */
function mean(values: number[]): number {
  return values.reduce((sum, value) => sum + value, 0) / values.length
}

/**
 * Validates one raw entry of a file's `results` array.
 * Returns null when the agent ID or any of the five measurements is missing
 * or not a finite number, so a single bad entry cannot turn averages into NaN.
 */
function toAgentSample(
  raw: unknown,
): { agentId: string; sample: AgentSample } | null {
  if (typeof raw !== 'object' || raw === null) return null
  const { agentId, overallScore, completionScore, codeQualityScore, cost, durationMs } =
    raw as Record<string, unknown>

  if (
    typeof agentId === 'string' &&
    isFiniteNumber(overallScore) &&
    isFiniteNumber(completionScore) &&
    isFiniteNumber(codeQualityScore) &&
    isFiniteNumber(cost) &&
    isFiniteNumber(durationMs)
  ) {
    return {
      agentId,
      sample: { overallScore, completionScore, codeQualityScore, cost, durationMs },
    }
  }
  return null
}

/**
 * Drops the samples whose overall score lies below the 25th-percentile score.
 * The cutoff is the score at index floor(n * 0.25) of the ascending scores;
 * samples tied with the cutoff are kept, so fewer than 25% may be removed.
 */
function dropBottomQuartile(samples: AgentSample[]): AgentSample[] {
  const ascending = samples.map((s) => s.overallScore).sort((a, b) => a - b)
  const cutoff = ascending[Math.floor(ascending.length * 0.25)]
  if (cutoff === undefined) return samples
  return samples.filter((s) => s.overallScore >= cutoff)
}

/**
 * Aggregates per-agent statistics from every file in `logDirectory` whose
 * name contains `ANALYSIS`.
 *
 * Files that cannot be read or parsed, files without a `results` array, and
 * individual results with missing or non-numeric fields are skipped with a
 * `console.warn` instead of aborting the report or producing NaN.
 *
 * @param logDirectory Directory holding the log files.
 * @param filterBottom25 When true, each agent's lowest-scoring tasks (below
 *   that agent's own 25th-percentile overall score) are excluded first.
 * @returns One summary per agent, sorted by average overall score, highest
 *   first. `stdDevOverall` is the population standard deviation.
 * @throws Whatever `readdirSync` throws when the directory cannot be listed.
 */
function analyzeBuffbenchLogs(
  logDirectory: string,
  filterBottom25 = false,
) {
  // A Map (not a plain object) so agent IDs such as "constructor" are safe keys.
  const samplesByAgent = new Map<string, AgentSample[]>()

  for (const file of readdirSync(logDirectory)) {
    if (!file.includes('ANALYSIS')) continue
    const filePath = join(logDirectory, file)

    let parsed: unknown
    try {
      parsed = JSON.parse(readFileSync(filePath, 'utf-8'))
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error)
      console.warn(`Skipping unreadable analysis file ${filePath}: ${reason}`)
      continue
    }

    const rawResults =
      typeof parsed === 'object' && parsed !== null
        ? (parsed as { results?: unknown }).results
        : undefined
    if (!Array.isArray(rawResults)) {
      console.warn(`Skipping analysis file ${filePath}: no "results" array`)
      continue
    }

    for (const raw of rawResults) {
      const entry = toAgentSample(raw)
      if (entry === null) {
        console.warn(`Skipping incomplete result in ${filePath}`)
        continue
      }
      const existing = samplesByAgent.get(entry.agentId)
      if (existing) {
        existing.push(entry.sample)
      } else {
        samplesByAgent.set(entry.agentId, [entry.sample])
      }
    }
  }

  const results = Array.from(samplesByAgent.entries()).map(
    ([agentId, allSamples]) => {
      const samples = filterBottom25
        ? dropBottomQuartile(allSamples)
        : allSamples
      const scores = samples.map((s) => s.overallScore)
      const avgOverall = mean(scores)
      // Population variance: mean squared deviation from the mean.
      const variance = mean(
        scores.map((score) => Math.pow(score - avgOverall, 2)),
      )

      return {
        agentId,
        count: samples.length,
        averageOverallScore: avgOverall,
        averageCompletionScore: mean(samples.map((s) => s.completionScore)),
        averageQualityScore: mean(samples.map((s) => s.codeQualityScore)),
        minOverallScore: Math.min(...scores),
        stdDevOverall: Math.sqrt(variance),
        averageCost: mean(samples.map((s) => s.cost)),
        averageDurationMs: mean(samples.map((s) => s.durationMs)),
      }
    },
  )

  // Best-scoring agent first.
  results.sort((a, b) => b.averageOverallScore - a.averageOverallScore)

  return results
}
141+
142+
// Script entry point: the first CLI argument selects the log directory;
// when it is absent (or empty) a fixed run folder is used instead.
const logDirectory = process.argv[2]
  ? process.argv[2]
  : 'evals/buffbench/logs/2025-10-13T20-07'

console.log('Analyzing logs from: ' + logDirectory + '\n')
146+
147+
/**
 * Prints a fixed-width summary table to stdout: a title, a header row, one
 * row per agent, and a trailing agent count.
 *
 * The Agent ID column is 20 characters wide unless some ID is longer, in
 * which case it grows to fit so the remaining columns stay aligned
 * (`padEnd` never truncates). The `=` rules are 130 characters wide and
 * stretch only if a row would be longer than that.
 *
 * @param results Per-agent summaries as returned by `analyzeBuffbenchLogs`.
 * @param title Heading printed above the table.
 */
function printTable(results: ReturnType<typeof analyzeBuffbenchLogs>, title: string) {
  const agentIdWidth = Math.max(
    20,
    ...results.map((result) => result.agentId.length),
  )
  // Width of each column; the last column is left unpadded to avoid trailing spaces.
  const widths = [agentIdWidth, 8, 10, 8, 10, 12, 10, 10, 0]
  const formatRow = (cells: string[]): string =>
    cells.map((cell, column) => cell.padEnd(widths[column] ?? 0)).join(' ')

  const header = formatRow([
    'Agent ID',
    'Count',
    'Overall',
    'Min',
    'StdDev',
    'Completion',
    'Quality',
    'Cost ($)',
    'Duration (s)',
  ])
  const rule = '='.repeat(Math.max(130, header.length))

  console.log(title)
  console.log(rule)
  console.log(header)
  console.log(rule)

  for (const result of results) {
    console.log(
      formatRow([
        result.agentId,
        result.count.toString(),
        result.averageOverallScore.toFixed(2),
        result.minOverallScore.toFixed(2),
        result.stdDevOverall.toFixed(2),
        result.averageCompletionScore.toFixed(2),
        result.averageQualityScore.toFixed(2),
        result.averageCost.toFixed(2),
        // Durations are stored in milliseconds; the table shows seconds.
        (result.averageDurationMs / 1000).toFixed(1),
      ]),
    )
  }

  console.log(rule)
  console.log(`Total agents analyzed: ${results.length}`)
}
180+
181+
// Print two tables from the same directory: every task first, then the view
// that drops each agent's lowest-scoring quartile, separated by blank lines.
const reports = [
  { filterBottom25: false, title: 'Agent Performance Summary (All Tasks):' },
  {
    filterBottom25: true,
    title: 'Agent Performance Summary (Top 75% Tasks by Overall Score):',
  },
]

reports.forEach((report, position) => {
  if (position > 0) {
    console.log('\n')
  }
  printTable(
    analyzeBuffbenchLogs(logDirectory, report.filterBottom25),
    report.title,
  )
})

0 commit comments

Comments
 (0)