Skip to content

Commit 315a5b5

Browse files
committed
prettier output
1 parent b83b0df commit 315a5b5

File tree

2 files changed

+164
-36
lines changed

2 files changed

+164
-36
lines changed

evals/buffbench/format-output.ts

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import type { JudgingResult } from './judge'
2+
import type { EvalCommitV2 } from './types'
3+
4+
/**
 * Renders the console report for a single agent's run against one eval commit.
 *
 * The report is a newline-joined block made of, in order: a banner naming the
 * agent, the commit id and the first seven characters of the commit sha; the
 * task spec; an optional error section; the judge's scores, analysis,
 * strengths and weaknesses; the duration/cost metrics; and an optional trace
 * file path. The block begins and ends with an empty line.
 *
 * NOTE(review): the leading-space widths inside the string literals below are
 * reproduced exactly as they appear in the captured diff; the capture may have
 * collapsed runs of spaces — confirm against the repository before merging.
 *
 * @param params.agentId - Identifier shown in square brackets in the banner.
 * @param params.commit - Eval commit; `id`, `sha` and `spec` are read.
 * @param params.judging - Judge output; the three scores are printed with one
 *   decimal place, followed by `analysis`, `strengths` and `weaknesses`.
 * @param params.cost - Printed with four decimal places after a `$` sign.
 * @param params.durationMs - Milliseconds; printed as seconds with one decimal.
 * @param params.error - When truthy, an ERROR section is inserted after the
 *   task. An empty string is treated the same as an absent value.
 * @param params.traceFilePath - When truthy, a "Trace saved to" line is added.
 * @returns The formatted multi-line report.
 */
export function formatAgentResult(params: {
  agentId: string
  commit: EvalCommitV2
  judging: JudgingResult
  cost: number
  durationMs: number
  error?: string
  traceFilePath?: string
}): string {
  const { agentId, commit, judging, cost, durationMs, error, traceFilePath } =
    params

  const separator = '='.repeat(80)
  const minorSeparator = '-'.repeat(80)

  // Strengths and weaknesses share one layout: a heading, 1-based numbered
  // items, then a blank line. An empty list contributes nothing at all.
  const numberedSection = (
    title: string,
    items: readonly string[],
  ): string[] =>
    items.length > 0
      ? [title, ...items.map((item, i) => ` ${i + 1}. ${item}`), '']
      : []

  const lines: string[] = [
    '',
    separator,
    `AGENT RESULT: [${agentId}] - ${commit.id} (${commit.sha.slice(0, 7)})`,
    separator,
    '',
    'TASK:',
    minorSeparator,
    commit.spec,
    '',
  ]

  if (error) {
    lines.push('❌ ERROR:', minorSeparator, error, '')
  }

  lines.push(
    'JUDGING RESULTS:',
    minorSeparator,
    '',
    'Scores:',
    ` Overall Score: ${judging.overallScore.toFixed(1)}/10`,
    ` Completion Score: ${judging.completionScore.toFixed(1)}/10`,
    ` Code Quality Score: ${judging.codeQualityScore.toFixed(1)}/10`,
    '',
    'Analysis:',
    judging.analysis,
    '',
  )

  lines.push(...numberedSection('Strengths:', judging.strengths))
  lines.push(...numberedSection('Weaknesses:', judging.weaknesses))

  lines.push(
    'METRICS:',
    minorSeparator,
    ` Duration: ${(durationMs / 1000).toFixed(1)}s`,
    ` Cost: $${cost.toFixed(4)}`,
    '',
  )

  if (traceFilePath) {
    lines.push(`Trace saved to: ${traceFilePath}`, '')
  }

  lines.push(separator, '')

  return lines.join('\n')
}
85+
86+
/**
 * Renders the console report for the trace analysis of one eval commit.
 *
 * The report is a newline-joined block made of, in order: a banner naming the
 * commit id and the first seven characters of the commit sha; the overall
 * analysis text; and, when at least one feedback entry is supplied, an
 * agent-specific section. Each agent entry is headed by `[agentId]` and lists
 * its non-empty strengths, weaknesses and recommendations as bullets; entries
 * are separated by a blank line. The block begins and ends with an empty line.
 *
 * NOTE(review): the leading-space widths inside the string literals below are
 * reproduced exactly as they appear in the captured diff; the capture may have
 * collapsed runs of spaces — confirm against the repository before merging.
 *
 * @param params.commit - Eval commit; only `id` and `sha` are read.
 * @param params.overallAnalysis - Text printed under "OVERALL ANALYSIS:".
 * @param params.agentFeedback - Per-agent feedback; an empty array omits the
 *   whole agent-specific section.
 * @returns The formatted multi-line report.
 */
export function formatTraceAnalysis(params: {
  commit: EvalCommitV2
  overallAnalysis: string
  agentFeedback: Array<{
    agentId: string
    strengths: string[]
    weaknesses: string[]
    recommendations: string[]
  }>
}): string {
  const { commit, overallAnalysis, agentFeedback } = params

  const separator = '='.repeat(80)
  const minorSeparator = '-'.repeat(80)

  // The three per-agent lists share one layout: a heading followed by one
  // bullet per item. An empty list contributes nothing, not even the heading.
  const bulletedSection = (
    title: string,
    items: readonly string[],
  ): string[] =>
    items.length > 0 ? [title, ...items.map((item) => ` • ${item}`)] : []

  const lines: string[] = [
    '',
    separator,
    `TRACE ANALYSIS: ${commit.id} (${commit.sha.slice(0, 7)})`,
    separator,
    '',
    'OVERALL ANALYSIS:',
    minorSeparator,
    overallAnalysis,
    '',
  ]

  if (agentFeedback.length > 0) {
    lines.push('AGENT-SPECIFIC FEEDBACK:', minorSeparator)

    agentFeedback.forEach((feedback, index) => {
      // Blank line between consecutive agents, but not before the first one.
      if (index > 0) lines.push('')

      lines.push(`[${feedback.agentId}]`)
      lines.push(...bulletedSection(' Strengths:', feedback.strengths))
      lines.push(...bulletedSection(' Weaknesses:', feedback.weaknesses))
      lines.push(
        ...bulletedSection(' Recommendations:', feedback.recommendations),
      )
    })

    lines.push('')
  }

  lines.push(separator, '')

  return lines.join('\n')
}

evals/buffbench/run-buffbench.ts

Lines changed: 19 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { getUserCredentials } from '@codebuff/npm-app/credentials'
66
import pLimit from 'p-limit'
77

88
import { runAgentOnCommit } from './agent-runner'
9+
import { formatAgentResult, formatTraceAnalysis } from './format-output'
910
import { judgeCommitResult } from './judge'
1011
import { analyzeAgentTraces, type AgentTraceData } from './trace-analyzer'
1112
import { CodebuffClient } from '../../sdk/src/client'
@@ -102,18 +103,6 @@ export async function runBuffBench(options: {
102103
error: agentResult.error,
103104
})
104105

105-
console.log(`\n[${agentId}] Judge Results:`)
106-
console.log(` Overall Score: ${judgeResult.overallScore}/10`)
107-
console.log(` Completion: ${judgeResult.completionScore}/10`)
108-
console.log(` Code Quality: ${judgeResult.codeQualityScore}/10`)
109-
console.log(` Analysis: ${judgeResult.analysis}`)
110-
if (judgeResult.strengths.length > 0) {
111-
console.log(` Strengths: ${judgeResult.strengths.join(', ')}`)
112-
}
113-
if (judgeResult.weaknesses.length > 0) {
114-
console.log(` Weaknesses: ${judgeResult.weaknesses.join(', ')}`)
115-
}
116-
117106
const evalRun = {
118107
commitSha: commit.sha,
119108
spec: commit.spec,
@@ -131,6 +120,17 @@ export async function runBuffBench(options: {
131120
const traceFilename = `${safeTaskId}-${safeAgentId}-${safeCommitShort}.json`
132121
const tracePath = path.join(logsDir, traceFilename)
133122

123+
const formattedOutput = formatAgentResult({
124+
agentId,
125+
commit,
126+
judging: judgeResult,
127+
cost: agentResult.cost,
128+
durationMs: agentResult.durationMs,
129+
error: agentResult.error,
130+
traceFilePath: tracePath,
131+
})
132+
console.log(formattedOutput)
133+
134134
const traceData = {
135135
agentId,
136136
commitSha: commit.sha,
@@ -229,30 +229,13 @@ export async function runBuffBench(options: {
229229

230230
const { overallAnalysis, agentFeedback } = analysis
231231
fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
232-
console.log(`Analysis saved to ${analysisPath}`)
233-
console.log(`\n=== Trace Analysis ===`)
234-
console.log(overallAnalysis)
235-
if (agentFeedback.length > 0) {
236-
console.log(`\nAgent-Specific Feedback:`)
237-
agentFeedback.forEach((feedback: any) => {
238-
console.log(`\n [${feedback.agentId}]`)
239-
if (feedback.strengths.length > 0) {
240-
console.log(
241-
` Strengths:\n${feedback.strengths.join('\n - ')}}`,
242-
)
243-
}
244-
if (feedback.weaknesses.length > 0) {
245-
console.log(
246-
` Weaknesses:\n${feedback.weaknesses.join('\n - ')}`,
247-
)
248-
}
249-
if (feedback.recommendations.length > 0) {
250-
console.log(
251-
` Recommendations:\n${feedback.recommendations.join('\n - ')}`,
252-
)
253-
}
254-
})
255-
}
232+
233+
const formattedAnalysis = formatTraceAnalysis({
234+
commit,
235+
overallAnalysis,
236+
agentFeedback,
237+
})
238+
console.log(formattedAnalysis)
256239
} catch (error) {
257240
console.error(
258241
`Failed to analyze traces for commit ${commit.sha}:`,

0 commit comments

Comments
 (0)