prettier output

jahooma · jahooma · commit 315a5b539d18 · 2025-10-12T15:48:46.000-07:00
diff --git a/evals/buffbench/format-output.ts b/evals/buffbench/format-output.ts
@@ -0,0 +1,145 @@
+import type { JudgingResult } from './judge'
+import type { EvalCommitV2 } from './types'
+
+/**
+ * Renders one agent's result on one eval commit as a plain-text console report.
+ *
+ * Sections, in order: banner, TASK (the commit spec), ERROR (only when `error`
+ * is truthy), JUDGING RESULTS (scores to one decimal, analysis, then numbered
+ * strengths/weaknesses when non-empty), METRICS, and the trace path if given.
+ *
+ * @param params.agentId - Shown in brackets in the banner.
+ * @param params.commit - Supplies `id`, `sha` (first 7 characters) and `spec`.
+ * @param params.judging - Judge scores, analysis, strengths and weaknesses.
+ * @param params.cost - Printed with a `$` prefix and four decimals.
+ * @param params.durationMs - Divided by 1000; printed with one decimal and `s`.
+ * @param params.error - Optional error text for the ERROR section.
+ * @param params.traceFilePath - Optional path for the "Trace saved to:" line.
+ * @returns The lines joined with '\n'; the first and last lines are empty.
+ */
+export function formatAgentResult(params: {
+  agentId: string
+  commit: EvalCommitV2
+  judging: JudgingResult
+  cost: number
+  durationMs: number
+  error?: string
+  traceFilePath?: string
+}): string {
+  const { commit, judging } = params
+  const heavyRule = '='.repeat(80)
+  const lightRule = '-'.repeat(80)
+  const shortSha = commit.sha.slice(0, 7)
+
+  // Numbered list followed by a blank line; nothing at all for an empty list.
+  const numbered = (title: string, items: string[]): string[] =>
+    items.length > 0
+      ? [title, ...items.map((item, i) => `  ${i + 1}. ${item}`), '']
+      : []
+
+  const out: string[] = [
+    '',
+    heavyRule,
+    `AGENT RESULT: [${params.agentId}] - ${commit.id} (${shortSha})`,
+    heavyRule,
+    '',
+    'TASK:',
+    lightRule,
+    commit.spec,
+    '',
+  ]
+
+  // Truthiness test: an empty-string error is skipped like a missing one.
+  if (params.error) {
+    out.push('❌ ERROR:', lightRule, params.error, '')
+  }
+
+  out.push(
+    'JUDGING RESULTS:',
+    lightRule,
+    '',
+    'Scores:',
+    `  Overall Score:       ${judging.overallScore.toFixed(1)}/10`,
+    `  Completion Score:    ${judging.completionScore.toFixed(1)}/10`,
+    `  Code Quality Score:  ${judging.codeQualityScore.toFixed(1)}/10`,
+    '',
+    'Analysis:',
+    judging.analysis,
+    '',
+    ...numbered('Strengths:', judging.strengths),
+    ...numbered('Weaknesses:', judging.weaknesses),
+    'METRICS:',
+    lightRule,
+    `  Duration: ${(params.durationMs / 1000).toFixed(1)}s`,
+    `  Cost:     $${params.cost.toFixed(4)}`,
+    '',
+  )
+
+  if (params.traceFilePath) {
+    out.push(`Trace saved to: ${params.traceFilePath}`, '')
+  }
+
+  out.push(heavyRule, '')
+  return out.join('\n')
+}
+
+/**
+ * Renders the trace analysis for one eval commit as a plain-text report.
+ *
+ * Layout: banner, the overall analysis, then (only when `agentFeedback` is
+ * non-empty) one `[agentId]` entry per agent with bulleted Strengths,
+ * Weaknesses and Recommendations; a heading is omitted when its list is empty.
+ *
+ * @param params.commit - Its `id` and first 7 characters of `sha` go in the banner.
+ * @param params.overallAnalysis - Printed verbatim under OVERALL ANALYSIS.
+ * @param params.agentFeedback - Per-agent feedback, rendered in array order.
+ * @returns The lines joined with '\n'; the first and last lines are empty.
+ */
+export function formatTraceAnalysis(params: {
+  commit: EvalCommitV2
+  overallAnalysis: string
+  agentFeedback: Array<{
+    agentId: string
+    strengths: string[]
+    weaknesses: string[]
+    recommendations: string[]
+  }>
+}): string {
+  const { commit, overallAnalysis, agentFeedback } = params
+  const heavyRule = '='.repeat(80)
+  const lightRule = '-'.repeat(80)
+
+  // Bulleted sub-section; yields nothing when the list is empty.
+  const bulleted = (title: string, items: string[]): string[] =>
+    items.length > 0 ? [title, ...items.map((item) => `    • ${item}`)] : []
+
+  const out: string[] = [
+    '',
+    heavyRule,
+    `TRACE ANALYSIS: ${commit.id} (${commit.sha.slice(0, 7)})`,
+    heavyRule,
+    '',
+    'OVERALL ANALYSIS:',
+    lightRule,
+    overallAnalysis,
+    '',
+  ]
+
+  if (agentFeedback.length > 0) {
+    out.push('AGENT-SPECIFIC FEEDBACK:', lightRule)
+    for (const [position, entry] of agentFeedback.entries()) {
+      // Blank line between agents, but none before the first one.
+      if (position > 0) out.push('')
+      out.push(
+        `[${entry.agentId}]`,
+        ...bulleted('  Strengths:', entry.strengths),
+        ...bulleted('  Weaknesses:', entry.weaknesses),
+        ...bulleted('  Recommendations:', entry.recommendations),
+      )
+    }
+    out.push('')
+  }
+
+  out.push(heavyRule, '')
+  return out.join('\n')
+}
diff --git a/evals/buffbench/run-buffbench.ts b/evals/buffbench/run-buffbench.ts
@@ -6,6 +6,7 @@ import { getUserCredentials } from '@codebuff/npm-app/credentials'
 import pLimit from 'p-limit'
 
 import { runAgentOnCommit } from './agent-runner'
+import { formatAgentResult, formatTraceAnalysis } from './format-output'
 import { judgeCommitResult } from './judge'
 import { analyzeAgentTraces, type AgentTraceData } from './trace-analyzer'
 import { CodebuffClient } from '../../sdk/src/client'
@@ -102,18 +103,6 @@ export async function runBuffBench(options: {
             error: agentResult.error,
           })
 
-          console.log(`\n[${agentId}] Judge Results:`)
-          console.log(`  Overall Score: ${judgeResult.overallScore}/10`)
-          console.log(`  Completion: ${judgeResult.completionScore}/10`)
-          console.log(`  Code Quality: ${judgeResult.codeQualityScore}/10`)
-          console.log(`  Analysis: ${judgeResult.analysis}`)
-          if (judgeResult.strengths.length > 0) {
-            console.log(`  Strengths: ${judgeResult.strengths.join(', ')}`)
-          }
-          if (judgeResult.weaknesses.length > 0) {
-            console.log(`  Weaknesses: ${judgeResult.weaknesses.join(', ')}`)
-          }
-
           const evalRun = {
             commitSha: commit.sha,
             spec: commit.spec,
@@ -131,6 +120,17 @@ export async function runBuffBench(options: {
           const traceFilename = `${safeTaskId}-${safeAgentId}-${safeCommitShort}.json`
           const tracePath = path.join(logsDir, traceFilename)
 
+          const formattedOutput = formatAgentResult({
+            agentId,
+            commit,
+            judging: judgeResult,
+            cost: agentResult.cost,
+            durationMs: agentResult.durationMs,
+            error: agentResult.error,
+            traceFilePath: tracePath,
+          })
+          console.log(formattedOutput)
+
           const traceData = {
             agentId,
             commitSha: commit.sha,
@@ -229,30 +229,13 @@ export async function runBuffBench(options: {
 
           const { overallAnalysis, agentFeedback } = analysis
           fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
-          console.log(`Analysis saved to ${analysisPath}`)
-          console.log(`\n=== Trace Analysis ===`)
-          console.log(overallAnalysis)
-          if (agentFeedback.length > 0) {
-            console.log(`\nAgent-Specific Feedback:`)
-            agentFeedback.forEach((feedback: any) => {
-              console.log(`\n  [${feedback.agentId}]`)
-              if (feedback.strengths.length > 0) {
-                console.log(
-                  `    Strengths:\n${feedback.strengths.join('\n    - ')}}`,
-                )
-              }
-              if (feedback.weaknesses.length > 0) {
-                console.log(
-                  `    Weaknesses:\n${feedback.weaknesses.join('\n    - ')}`,
-                )
-              }
-              if (feedback.recommendations.length > 0) {
-                console.log(
-                  `    Recommendations:\n${feedback.recommendations.join('\n    - ')}`,
-                )
-              }
-            })
-          }
+
+          const formattedAnalysis = formatTraceAnalysis({
+            commit,
+            overallAnalysis,
+            agentFeedback,
+          })
+          console.log(formattedAnalysis)
         } catch (error) {
           console.error(
             `Failed to analyze traces for commit ${commit.sha}:`,