buffbench: fix up trace analysis to use prompt instead of spec, handle errors, run even with one agent

jahooma · jahooma · commit 0aba60646318 · 2025-10-13T14:54:12.000-07:00
diff --git a/evals/buffbench/run-buffbench.ts b/evals/buffbench/run-buffbench.ts
@@ -170,65 +170,55 @@ export async function runBuffBench(options: {
       })
 
       const agentResults = await Promise.all(agentPromises) // After all agents complete for this commit, run trace analysis
-      if (commitTraces.length > 1) {
-        try {
-          const analysis = await analyzeAgentTraces({
-            client,
-            traces: commitTraces,
-            spec: commit.spec,
-          })
-
-          // Save analysis to logs directory
-          const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
-          const analysisCommitShort = commit.sha.slice(0, 7)
-          const analysisFilename = `${safeTaskId}-ANALYSIS-${analysisCommitShort}.json`
-          const analysisPath = path.join(logsDir, analysisFilename)
 
-          const analysisData = {
-            commitSha: commit.sha,
-            timestamp: new Date().toISOString(),
-            ...analysis,
-            results: commitTraces.map((t) => ({
-              agentId: t.agentId,
-              ...t.judgeResult,
-              cost: t.cost,
-              durationMs: t.durationMs,
-              error: t.error,
-            })),
-            spec: commit.spec,
-          }
+      const traceAnalysis = await analyzeAgentTraces({
+        client,
+        traces: commitTraces,
+        codingAgentPrompt: commit.prompt,
+      })
 
-          const { overallAnalysis, agentFeedback } = analysis
-          fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
-
-          // Print all agent results with their judging, then trace analysis together
-          console.log(
-            formatTaskResults({
-              commit,
-              taskNumber: index + 1,
-              totalTasks: commitsToRun.length,
-              agentResults: commitTraces.map((trace) => ({
-                agentId: trace.agentId,
-                judging: trace.judgeResult,
-                cost: trace.cost,
-                durationMs: trace.durationMs,
-                error: trace.error,
-                traceFilePath: path.join(
-                  logsDir,
-                  `${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
-                ),
-              })),
-              traceAnalysis: { overallAnalysis, agentFeedback },
-            }),
-          )
-        } catch (error) {
-          console.error(
-            `Failed to analyze traces for commit ${commit.sha}:`,
-            error,
-          )
-        }
+      const analysisData = {
+        commitSha: commit.sha,
+        timestamp: new Date().toISOString(),
+        ...traceAnalysis,
+        results: commitTraces.map((t) => ({
+          agentId: t.agentId,
+          ...t.judgeResult,
+          cost: t.cost,
+          durationMs: t.durationMs,
+          error: t.error,
+        })),
+        prompt: commit.prompt,
       }
 
+      // Save analysis to logs directory
+      const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
+      const analysisCommitShort = commit.sha.slice(0, 7)
+      const analysisFilename = `${safeTaskId}-ANALYSIS-${analysisCommitShort}.json`
+      const analysisPath = path.join(logsDir, analysisFilename)
+      fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
+
+      // Print all agent results with their judging, then trace analysis together
+      console.log(
+        formatTaskResults({
+          commit,
+          taskNumber: index + 1,
+          totalTasks: commitsToRun.length,
+          agentResults: commitTraces.map((trace) => ({
+            agentId: trace.agentId,
+            judging: trace.judgeResult,
+            cost: trace.cost,
+            durationMs: trace.durationMs,
+            error: trace.error,
+            traceFilePath: path.join(
+              logsDir,
+              `${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
+            ),
+          })),
+          traceAnalysis,
+        }),
+      )
+
       return { commit, agentResults }
     }),
   )
diff --git a/evals/buffbench/trace-analyzer.ts b/evals/buffbench/trace-analyzer.ts
@@ -3,6 +3,7 @@ import type { JudgingResult } from './judge'
 import type { AgentDefinition } from '../../sdk/src'
 import type { CodebuffClient } from '../../sdk/src/client'
 import { withTimeout } from '@codebuff/common/util/promise'
+import { getErrorObject } from '@codebuff/common/util/error'
 
 export interface AgentTraceData {
   agentId: string
@@ -156,7 +157,7 @@ const traceAnalyzerAgent: AgentDefinition = {
 ## Your Role
 
 You will receive:
-1. A task specification (for context only)
+1. A task prompt (for context only)
 2. Full traces from each agent showing their step-by-step process
 3. Performance metrics (scores, cost, time, errors)
 
@@ -190,11 +191,11 @@ Note: read_files tool results show [TRUNCATED] for file contents to save space.`
 export async function analyzeAgentTraces({
   client,
   traces,
-  spec,
+  codingAgentPrompt,
 }: {
   client: CodebuffClient
   traces: AgentTraceData[]
-  spec: string
+  codingAgentPrompt: string
 }): Promise<{
   overallAnalysis: string
   agentFeedback: Array<{
@@ -204,17 +205,18 @@ export async function analyzeAgentTraces({
     recommendations: string[]
   }>
 }> {
-  const truncatedTraces = traces.map((t) => ({
-    agentId: t.agentId,
-    trace: truncateTrace(t.trace),
-    judgeResult: t.judgeResult,
-    cost: t.cost,
-    durationMs: t.durationMs,
-    error: t.error,
-  }))
+  try {
+    const truncatedTraces = traces.map((t) => ({
+      agentId: t.agentId,
+      trace: truncateTrace(t.trace),
+      judgeResult: t.judgeResult,
+      cost: t.cost,
+      durationMs: t.durationMs,
+      error: t.error,
+    }))
 
-  const prompt = `## Task Specification (for context)
-${spec}
+    const prompt = `## Coding Agent Prompt (for context)
+${codingAgentPrompt}
 
 ## Agent Traces and Results
 ${JSON.stringify(truncatedTraces, null, 2)}
@@ -239,39 +241,46 @@ Analyze how these agents approached the problem, focusing on their processes and
 
 Focus on the HOW, not the WHAT: We want to understand and improve how agents work, not evaluate their specific code output.`
 
-  const agentOutput: string[] = []
-  const analyzerResult = await withTimeout(
-    client.run({
-      agent: 'git-evals2-trace-analyzer',
-      prompt,
-      agentDefinitions: [traceAnalyzerAgent],
-      handleEvent: (event) => {
-        if (event.type === 'text') {
-          agentOutput.push(event.text)
-        } else if (event.type === 'tool_call') {
-          agentOutput.push(JSON.stringify(event, null, 2))
-        } else if (event.type === 'error') {
-          console.warn('[Trace Analyzer] Error event:', event.message)
-        }
-      },
-    }),
-    10 * 60 * 1000,
-    'Trace analyzer agent timed out after 10 minutes',
-  )
+    const agentOutput: string[] = []
+    const analyzerResult = await withTimeout(
+      client.run({
+        agent: 'git-evals2-trace-analyzer',
+        prompt,
+        agentDefinitions: [traceAnalyzerAgent],
+        handleEvent: (event) => {
+          if (event.type === 'text') {
+            agentOutput.push(event.text)
+          } else if (event.type === 'tool_call') {
+            agentOutput.push(JSON.stringify(event, null, 2))
+          } else if (event.type === 'error') {
+            console.warn('[Trace Analyzer] Error event:', event.message)
+          }
+        },
+      }),
+      10 * 60 * 1000,
+      'Trace analyzer agent timed out after 10 minutes',
+    )
 
-  const { output } = analyzerResult
+    const { output } = analyzerResult
 
-  if (output.type !== 'structuredOutput' || output.value === null) {
-    console.error(
-      'Error running trace analyzer - not structured output',
-      JSON.stringify(output, null, 2),
-    )
-    console.error('Trace analyzer output trace:', agentOutput.join(''))
+    if (output.type !== 'structuredOutput' || output.value === null) {
+      console.error(
+        'Error running trace analyzer - not structured output',
+        JSON.stringify(output, null, 2),
+      )
+      console.error('Trace analyzer output trace:', agentOutput.join(''))
+      return {
+        overallAnalysis: 'Error running trace analyzer - not structured output',
+        agentFeedback: [],
+      }
+    }
+
+    return output.value as any
+  } catch (error) {
+    console.error(`Failed to analyze traces:`, getErrorObject(error))
     return {
-      overallAnalysis: 'Error running trace analyzer - not structured output',
+      overallAnalysis: `Error running trace analyzer: ${getErrorObject(error).message}`,
       agentFeedback: [],
     }
   }
-
-  return output.value as any
 }