Skip to content

Commit d3155d1

Browse files
committed
buffbench: add timeouts for judge and trace analyzer
1 parent b340cf0 commit d3155d1

File tree

3 files changed

+41
-36
lines changed

3 files changed

+41
-36
lines changed

evals/buffbench/judge.ts

Lines changed: 20 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@ import { z } from 'zod/v4'
33
import type { FileDiff } from './types'
44
import type { AgentDefinition } from '../../sdk/src'
55
import type { CodebuffClient } from '../../sdk/src/client'
6+
import { withTimeout } from '@codebuff/common/util/promise'
67

78
export const JudgingResultSchema = z.object({
89
analysis: z
@@ -166,32 +167,31 @@ ${agentDiff || '(No changes made)'}
166167
${error ? `\n## Error Encountered\n${error}` : ''}`
167168

168169
const agentOutput: string[] = []
169-
const judgeResult = await client.run({
170-
agent: 'git-evals2-judge',
171-
prompt: judgePrompt,
172-
agentDefinitions: [judgeAgent],
173-
handleEvent: (event) => {
174-
if (event.type === 'text') {
175-
agentOutput.push(event.text)
176-
}
177-
else if (event.type === 'tool_call') {
178-
agentOutput.push(JSON.stringify(event, null, 2))
179-
}
180-
else if (event.type === 'error') {
181-
console.warn('[Judge] Error event:', event.message)
182-
}
183-
},
184-
})
170+
const judgeResult = await withTimeout(
171+
client.run({
172+
agent: 'git-evals2-judge',
173+
prompt: judgePrompt,
174+
agentDefinitions: [judgeAgent],
175+
handleEvent: (event) => {
176+
if (event.type === 'text') {
177+
agentOutput.push(event.text)
178+
} else if (event.type === 'tool_call') {
179+
agentOutput.push(JSON.stringify(event, null, 2))
180+
} else if (event.type === 'error') {
181+
console.warn('[Judge] Error event:', event.message)
182+
}
183+
},
184+
}),
185+
10 * 60 * 1000,
186+
'Judge agent timed out after 10 minutes',
187+
)
185188

186189
if (judgeResult.output.type !== 'structuredOutput') {
187190
console.error(
188191
'Error running judge agent - not structured output',
189192
JSON.stringify(judgeResult.output, null, 2),
190193
)
191-
console.error(
192-
'Judge agent output trace:',
193-
agentOutput.join(''),
194-
)
194+
console.error('Judge agent output trace:', agentOutput.join(''))
195195
return {
196196
analysis: 'Error running judge agent - not structured output',
197197
strengths: [],

evals/buffbench/run-buffbench.ts

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,6 @@ export async function runBuffBench(options: {
6060
console.log(
6161
`\n=== Task ${index + 1}/${commitsToRun.length}: ${commit.id} (${commit.sha.slice(0, 7)}) ===`,
6262
)
63-
console.log(`Prompt: ${commit.prompt}`)
6463

6564
// Store trace data for this commit to analyze later
6665
const commitTraces: AgentTraceData[] = []

evals/buffbench/trace-analyzer.ts

Lines changed: 21 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ import type { AgentStep } from './agent-runner'
22
import type { JudgingResult } from './judge'
33
import type { AgentDefinition } from '../../sdk/src'
44
import type { CodebuffClient } from '../../sdk/src/client'
5+
import { withTimeout } from '@codebuff/common/util/promise'
56

67
export interface AgentTraceData {
78
agentId: string
@@ -140,7 +141,8 @@ const traceAnalyzerAgent: AgentDefinition = {
140141
recommendations: {
141142
type: 'array',
142143
items: { type: 'string' },
143-
description: 'Recommendations for improving this agent and it\'s process. Note: do not include recommendations for improving the code in this task',
144+
description:
145+
"Recommendations for improving this agent and it's process. Note: do not include recommendations for improving the code in this task",
144146
},
145147
},
146148
required: ['agentId', 'strengths', 'weaknesses', 'recommendations'],
@@ -238,20 +240,24 @@ Analyze how these agents approached the problem, focusing on their processes and
238240
Focus on the HOW, not the WHAT: We want to understand and improve how agents work, not evaluate their specific code output.`
239241

240242
const agentOutput: string[] = []
241-
const analyzerResult = await client.run({
242-
agent: 'git-evals2-trace-analyzer',
243-
prompt,
244-
agentDefinitions: [traceAnalyzerAgent],
245-
handleEvent: (event) => {
246-
if (event.type === 'text') {
247-
agentOutput.push(event.text)
248-
} else if (event.type === 'tool_call') {
249-
agentOutput.push(JSON.stringify(event, null, 2))
250-
} else if (event.type === 'error') {
251-
console.warn('[Trace Analyzer] Error event:', event.message)
252-
}
253-
},
254-
})
243+
const analyzerResult = await withTimeout(
244+
client.run({
245+
agent: 'git-evals2-trace-analyzer',
246+
prompt,
247+
agentDefinitions: [traceAnalyzerAgent],
248+
handleEvent: (event) => {
249+
if (event.type === 'text') {
250+
agentOutput.push(event.text)
251+
} else if (event.type === 'tool_call') {
252+
agentOutput.push(JSON.stringify(event, null, 2))
253+
} else if (event.type === 'error') {
254+
console.warn('[Trace Analyzer] Error event:', event.message)
255+
}
256+
},
257+
}),
258+
10 * 60 * 1000,
259+
'Trace analyzer agent timed out after 10 minutes',
260+
)
255261

256262
const { output } = analyzerResult
257263

0 commit comments

Comments
 (0)