buffbench: add timeouts for judge and trace analyzer

jahooma · jahooma · commit d3155d14b5b8 · 2025-10-13T14:40:24.000-07:00
diff --git a/evals/buffbench/judge.ts b/evals/buffbench/judge.ts
@@ -3,6 +3,7 @@ import { z } from 'zod/v4'
 import type { FileDiff } from './types'
 import type { AgentDefinition } from '../../sdk/src'
 import type { CodebuffClient } from '../../sdk/src/client'
+import { withTimeout } from '@codebuff/common/util/promise'
 
 export const JudgingResultSchema = z.object({
   analysis: z
@@ -166,32 +167,31 @@ ${agentDiff || '(No changes made)'}
 ${error ? `\n## Error Encountered\n${error}` : ''}`
 
   const agentOutput: string[] = []
-  const judgeResult = await client.run({
-    agent: 'git-evals2-judge',
-    prompt: judgePrompt,
-    agentDefinitions: [judgeAgent],
-    handleEvent: (event) => {
-      if (event.type === 'text') {
-        agentOutput.push(event.text)
-      }
-      else if (event.type === 'tool_call') {
-        agentOutput.push(JSON.stringify(event, null, 2))
-      }
-      else if (event.type === 'error') {
-        console.warn('[Judge] Error event:', event.message)
-      }
-    },
-  })
+  const judgeResult = await withTimeout(
+    client.run({
+      agent: 'git-evals2-judge',
+      prompt: judgePrompt,
+      agentDefinitions: [judgeAgent],
+      handleEvent: (event) => {
+        if (event.type === 'text') {
+          agentOutput.push(event.text)
+        } else if (event.type === 'tool_call') {
+          agentOutput.push(JSON.stringify(event, null, 2))
+        } else if (event.type === 'error') {
+          console.warn('[Judge] Error event:', event.message)
+        }
+      },
+    }),
+    10 * 60 * 1000,
+    'Judge agent timed out after 10 minutes',
+  )
 
   if (judgeResult.output.type !== 'structuredOutput') {
     console.error(
       'Error running judge agent - not structured output',
       JSON.stringify(judgeResult.output, null, 2),
     )
-    console.error(
-      'Judge agent output trace:',
-      agentOutput.join(''),
-    )
+    console.error('Judge agent output trace:', agentOutput.join(''))
     return {
       analysis: 'Error running judge agent - not structured output',
       strengths: [],
diff --git a/evals/buffbench/run-buffbench.ts b/evals/buffbench/run-buffbench.ts
@@ -60,7 +60,6 @@ export async function runBuffBench(options: {
       console.log(
         `\n=== Task ${index + 1}/${commitsToRun.length}: ${commit.id} (${commit.sha.slice(0, 7)}) ===`,
       )
-      console.log(`Prompt: ${commit.prompt}`)
 
       // Store trace data for this commit to analyze later
       const commitTraces: AgentTraceData[] = []
diff --git a/evals/buffbench/trace-analyzer.ts b/evals/buffbench/trace-analyzer.ts
@@ -2,6 +2,7 @@ import type { AgentStep } from './agent-runner'
 import type { JudgingResult } from './judge'
 import type { AgentDefinition } from '../../sdk/src'
 import type { CodebuffClient } from '../../sdk/src/client'
+import { withTimeout } from '@codebuff/common/util/promise'
 
 export interface AgentTraceData {
   agentId: string
@@ -140,7 +141,8 @@ const traceAnalyzerAgent: AgentDefinition = {
             recommendations: {
               type: 'array',
               items: { type: 'string' },
-              description: 'Recommendations for improving this agent and it\'s process. Note: do not include recommendations for improving the code in this task',
+              description:
+                "Recommendations for improving this agent and it's process. Note: do not include recommendations for improving the code in this task",
             },
           },
           required: ['agentId', 'strengths', 'weaknesses', 'recommendations'],
@@ -238,20 +240,24 @@ Analyze how these agents approached the problem, focusing on their processes and
 Focus on the HOW, not the WHAT: We want to understand and improve how agents work, not evaluate their specific code output.`
 
   const agentOutput: string[] = []
-  const analyzerResult = await client.run({
-    agent: 'git-evals2-trace-analyzer',
-    prompt,
-    agentDefinitions: [traceAnalyzerAgent],
-    handleEvent: (event) => {
-      if (event.type === 'text') {
-        agentOutput.push(event.text)
-      } else if (event.type === 'tool_call') {
-        agentOutput.push(JSON.stringify(event, null, 2))
-      } else if (event.type === 'error') {
-        console.warn('[Trace Analyzer] Error event:', event.message)
-      }
-    },
-  })
+  const analyzerResult = await withTimeout(
+    client.run({
+      agent: 'git-evals2-trace-analyzer',
+      prompt,
+      agentDefinitions: [traceAnalyzerAgent],
+      handleEvent: (event) => {
+        if (event.type === 'text') {
+          agentOutput.push(event.text)
+        } else if (event.type === 'tool_call') {
+          agentOutput.push(JSON.stringify(event, null, 2))
+        } else if (event.type === 'error') {
+          console.warn('[Trace Analyzer] Error event:', event.message)
+        }
+      },
+    }),
+    10 * 60 * 1000,
+    'Trace analyzer agent timed out after 10 minutes',
+  )
 
   const { output } = analyzerResult
 

Original file line number	Diff line number	Diff line change
`@@ -60,7 +60,6 @@ export async function runBuffBench(options: {`
`60`	`60`	`console.log(`
`61`	`61`	`` `\n=== Task ${index + 1}/${commitsToRun.length}: ${commit.id} (${commit.sha.slice(0, 7)}) ===`, ``
`62`	`62`	`)`
`63`		`` - console.log(`Prompt: ${commit.prompt}`) ``
`64`	`63`
`65`	`64`	`// Store trace data for this commit to analyze later`
`66`	`65`	`const commitTraces: AgentTraceData[] = []`