Skip to content

Commit 0aba606

Browse files
committed
buffbench: fix up trace analysis to use the prompt instead of the spec, handle errors, and run even with only one agent
1 parent d3155d1 commit 0aba606

File tree

2 files changed: 96 additions & 97 deletions

2 files changed: 96 additions & 97 deletions

evals/buffbench/run-buffbench.ts

Lines changed: 45 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -170,65 +170,55 @@ export async function runBuffBench(options: {
170170
})
171171

172172
const agentResults = await Promise.all(agentPromises) // After all agents complete for this commit, run trace analysis
173-
if (commitTraces.length > 1) {
174-
try {
175-
const analysis = await analyzeAgentTraces({
176-
client,
177-
traces: commitTraces,
178-
spec: commit.spec,
179-
})
180-
181-
// Save analysis to logs directory
182-
const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
183-
const analysisCommitShort = commit.sha.slice(0, 7)
184-
const analysisFilename = `${safeTaskId}-ANALYSIS-${analysisCommitShort}.json`
185-
const analysisPath = path.join(logsDir, analysisFilename)
186173

187-
const analysisData = {
188-
commitSha: commit.sha,
189-
timestamp: new Date().toISOString(),
190-
...analysis,
191-
results: commitTraces.map((t) => ({
192-
agentId: t.agentId,
193-
...t.judgeResult,
194-
cost: t.cost,
195-
durationMs: t.durationMs,
196-
error: t.error,
197-
})),
198-
spec: commit.spec,
199-
}
174+
const traceAnalysis = await analyzeAgentTraces({
175+
client,
176+
traces: commitTraces,
177+
codingAgentPrompt: commit.prompt,
178+
})
200179

201-
const { overallAnalysis, agentFeedback } = analysis
202-
fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
203-
204-
// Print all agent results with their judging, then trace analysis together
205-
console.log(
206-
formatTaskResults({
207-
commit,
208-
taskNumber: index + 1,
209-
totalTasks: commitsToRun.length,
210-
agentResults: commitTraces.map((trace) => ({
211-
agentId: trace.agentId,
212-
judging: trace.judgeResult,
213-
cost: trace.cost,
214-
durationMs: trace.durationMs,
215-
error: trace.error,
216-
traceFilePath: path.join(
217-
logsDir,
218-
`${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
219-
),
220-
})),
221-
traceAnalysis: { overallAnalysis, agentFeedback },
222-
}),
223-
)
224-
} catch (error) {
225-
console.error(
226-
`Failed to analyze traces for commit ${commit.sha}:`,
227-
error,
228-
)
229-
}
180+
const analysisData = {
181+
commitSha: commit.sha,
182+
timestamp: new Date().toISOString(),
183+
...traceAnalysis,
184+
results: commitTraces.map((t) => ({
185+
agentId: t.agentId,
186+
...t.judgeResult,
187+
cost: t.cost,
188+
durationMs: t.durationMs,
189+
error: t.error,
190+
})),
191+
prompt: commit.prompt,
230192
}
231193

194+
// Save analysis to logs directory
195+
const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
196+
const analysisCommitShort = commit.sha.slice(0, 7)
197+
const analysisFilename = `${safeTaskId}-ANALYSIS-${analysisCommitShort}.json`
198+
const analysisPath = path.join(logsDir, analysisFilename)
199+
fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
200+
201+
// Print all agent results with their judging, then trace analysis together
202+
console.log(
203+
formatTaskResults({
204+
commit,
205+
taskNumber: index + 1,
206+
totalTasks: commitsToRun.length,
207+
agentResults: commitTraces.map((trace) => ({
208+
agentId: trace.agentId,
209+
judging: trace.judgeResult,
210+
cost: trace.cost,
211+
durationMs: trace.durationMs,
212+
error: trace.error,
213+
traceFilePath: path.join(
214+
logsDir,
215+
`${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
216+
),
217+
})),
218+
traceAnalysis,
219+
}),
220+
)
221+
232222
return { commit, agentResults }
233223
}),
234224
)

evals/buffbench/trace-analyzer.ts

Lines changed: 51 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import type { JudgingResult } from './judge'
33
import type { AgentDefinition } from '../../sdk/src'
44
import type { CodebuffClient } from '../../sdk/src/client'
55
import { withTimeout } from '@codebuff/common/util/promise'
6+
import { getErrorObject } from '@codebuff/common/util/error'
67

78
export interface AgentTraceData {
89
agentId: string
@@ -156,7 +157,7 @@ const traceAnalyzerAgent: AgentDefinition = {
156157
## Your Role
157158
158159
You will receive:
159-
1. A task specification (for context only)
160+
1. A task prompt (for context only)
160161
2. Full traces from each agent showing their step-by-step process
161162
3. Performance metrics (scores, cost, time, errors)
162163
@@ -190,11 +191,11 @@ Note: read_files tool results show [TRUNCATED] for file contents to save space.`
190191
export async function analyzeAgentTraces({
191192
client,
192193
traces,
193-
spec,
194+
codingAgentPrompt,
194195
}: {
195196
client: CodebuffClient
196197
traces: AgentTraceData[]
197-
spec: string
198+
codingAgentPrompt: string
198199
}): Promise<{
199200
overallAnalysis: string
200201
agentFeedback: Array<{
@@ -204,17 +205,18 @@ export async function analyzeAgentTraces({
204205
recommendations: string[]
205206
}>
206207
}> {
207-
const truncatedTraces = traces.map((t) => ({
208-
agentId: t.agentId,
209-
trace: truncateTrace(t.trace),
210-
judgeResult: t.judgeResult,
211-
cost: t.cost,
212-
durationMs: t.durationMs,
213-
error: t.error,
214-
}))
208+
try {
209+
const truncatedTraces = traces.map((t) => ({
210+
agentId: t.agentId,
211+
trace: truncateTrace(t.trace),
212+
judgeResult: t.judgeResult,
213+
cost: t.cost,
214+
durationMs: t.durationMs,
215+
error: t.error,
216+
}))
215217

216-
const prompt = `## Task Specification (for context)
217-
${spec}
218+
const prompt = `## Coding Agent Prompt (for context)
219+
${codingAgentPrompt}
218220
219221
## Agent Traces and Results
220222
${JSON.stringify(truncatedTraces, null, 2)}
@@ -239,39 +241,46 @@ Analyze how these agents approached the problem, focusing on their processes and
239241
240242
Focus on the HOW, not the WHAT: We want to understand and improve how agents work, not evaluate their specific code output.`
241243

242-
const agentOutput: string[] = []
243-
const analyzerResult = await withTimeout(
244-
client.run({
245-
agent: 'git-evals2-trace-analyzer',
246-
prompt,
247-
agentDefinitions: [traceAnalyzerAgent],
248-
handleEvent: (event) => {
249-
if (event.type === 'text') {
250-
agentOutput.push(event.text)
251-
} else if (event.type === 'tool_call') {
252-
agentOutput.push(JSON.stringify(event, null, 2))
253-
} else if (event.type === 'error') {
254-
console.warn('[Trace Analyzer] Error event:', event.message)
255-
}
256-
},
257-
}),
258-
10 * 60 * 1000,
259-
'Trace analyzer agent timed out after 10 minutes',
260-
)
244+
const agentOutput: string[] = []
245+
const analyzerResult = await withTimeout(
246+
client.run({
247+
agent: 'git-evals2-trace-analyzer',
248+
prompt,
249+
agentDefinitions: [traceAnalyzerAgent],
250+
handleEvent: (event) => {
251+
if (event.type === 'text') {
252+
agentOutput.push(event.text)
253+
} else if (event.type === 'tool_call') {
254+
agentOutput.push(JSON.stringify(event, null, 2))
255+
} else if (event.type === 'error') {
256+
console.warn('[Trace Analyzer] Error event:', event.message)
257+
}
258+
},
259+
}),
260+
10 * 60 * 1000,
261+
'Trace analyzer agent timed out after 10 minutes',
262+
)
261263

262-
const { output } = analyzerResult
264+
const { output } = analyzerResult
263265

264-
if (output.type !== 'structuredOutput' || output.value === null) {
265-
console.error(
266-
'Error running trace analyzer - not structured output',
267-
JSON.stringify(output, null, 2),
268-
)
269-
console.error('Trace analyzer output trace:', agentOutput.join(''))
266+
if (output.type !== 'structuredOutput' || output.value === null) {
267+
console.error(
268+
'Error running trace analyzer - not structured output',
269+
JSON.stringify(output, null, 2),
270+
)
271+
console.error('Trace analyzer output trace:', agentOutput.join(''))
272+
return {
273+
overallAnalysis: 'Error running trace analyzer - not structured output',
274+
agentFeedback: [],
275+
}
276+
}
277+
278+
return output.value as any
279+
} catch (error) {
280+
console.error(`Failed to analyze traces:`, getErrorObject(error))
270281
return {
271-
overallAnalysis: 'Error running trace analyzer - not structured output',
282+
overallAnalysis: `Error running trace analyzer: ${getErrorObject(error).message}`,
272283
agentFeedback: [],
273284
}
274285
}
275-
276-
return output.value as any
277286
}

0 commit comments

Comments
 (0)