Skip to content

Commit 3dd6318

Browse files
committed
More output tweaks
1 parent 35a3a38 commit 3dd6318

File tree

2 files changed

+50
-34
lines changed

2 files changed

+50
-34
lines changed

evals/buffbench/format-output.ts

Lines changed: 16 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -9,20 +9,28 @@ export function formatAgentResult(params: {
99
durationMs: number
1010
error?: string
1111
traceFilePath?: string
12+
agentNumber: number
13+
totalAgents: number
1214
}): string {
13-
const { agentId, commit, judging, cost, durationMs, error, traceFilePath } =
14-
params
15+
const {
16+
agentId,
17+
commit,
18+
judging,
19+
cost,
20+
durationMs,
21+
error,
22+
traceFilePath,
23+
agentNumber,
24+
totalAgents,
25+
} = params
1526

1627
const lines: string[] = []
17-
const separator = '='.repeat(80)
1828
const minorSeparator = '-'.repeat(80)
1929

2030
lines.push('')
21-
lines.push(separator)
22-
lines.push(
23-
`AGENT RESULT: [${agentId}] - ${commit.id} (${commit.sha.slice(0, 7)})`,
24-
)
25-
lines.push(separator)
31+
lines.push(minorSeparator)
32+
lines.push(`AGENT ${agentNumber}/${totalAgents}: [${agentId}]`)
33+
lines.push(minorSeparator)
2634
lines.push('')
2735

2836
lines.push('TASK:')
@@ -77,9 +85,6 @@ export function formatAgentResult(params: {
7785
lines.push('')
7886
}
7987

80-
lines.push(separator)
81-
lines.push('')
82-
8388
return lines.join('\n')
8489
}
8590

evals/buffbench/run-buffbench.ts

Lines changed: 34 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -67,10 +67,10 @@ export async function runBuffBench(options: {
6767

6868
const commitLimit = pLimit(commitConcurrency)
6969

70-
const commitPromises = commitsToRun.map((commit) =>
70+
const commitPromises = commitsToRun.map((commit, index) =>
7171
commitLimit(async () => {
7272
console.log(
73-
`\n=== Evaluating task: ${commit.id} (${commit.sha.slice(0, 7)}) ===`,
73+
`\n=== Task ${index + 1}/${commitsToRun.length}: ${commit.id} (${commit.sha.slice(0, 7)}) ===`,
7474
)
7575
console.log(`Prompt: ${commit.prompt}`)
7676

@@ -120,18 +120,8 @@ export async function runBuffBench(options: {
120120
const traceFilename = `${safeTaskId}-${safeAgentId}-${safeCommitShort}.json`
121121
const tracePath = path.join(logsDir, traceFilename)
122122

123-
const formattedOutput = formatAgentResult({
124-
agentId,
125-
commit,
126-
judging: judgeResult,
127-
cost: agentResult.cost,
128-
durationMs: agentResult.durationMs,
129-
error: agentResult.error,
130-
traceFilePath: tracePath,
131-
})
132-
console.log(formattedOutput)
133-
134-
const traceData = {
123+
// Store judging result and trace for combined output later
124+
commitTraces.push({
135125
agentId,
136126
commitSha: commit.sha,
137127
spec: commit.spec,
@@ -142,13 +132,12 @@ export async function runBuffBench(options: {
142132
durationMs: agentResult.durationMs,
143133
error: agentResult.error,
144134
timestamp: new Date().toISOString(),
145-
}
146-
147-
fs.writeFileSync(tracePath, JSON.stringify(traceData, null, 2))
148-
console.log(`Trace saved to ${tracePath}`)
135+
})
149136

150-
// Store for later analysis
151-
commitTraces.push(traceData)
137+
fs.writeFileSync(
138+
tracePath,
139+
JSON.stringify(commitTraces[commitTraces.length - 1], null, 2),
140+
)
152141

153142
onProgress?.({
154143
type: 'agent_complete',
@@ -197,9 +186,6 @@ export async function runBuffBench(options: {
197186

198187
// After all agents complete for this commit, run trace analysis
199188
if (commitTraces.length > 1) {
200-
console.log(
201-
`\n=== Analyzing agent traces for ${commit.id} (${commit.sha.slice(0, 7)}) ===`,
202-
)
203189
try {
204190
const analysis = await analyzeAgentTraces({
205191
client,
@@ -230,6 +216,31 @@ export async function runBuffBench(options: {
230216
const { overallAnalysis, agentFeedback } = analysis
231217
fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
232218

219+
// Print all agent results with their judging, then trace analysis together
220+
console.log('\n' + '='.repeat(80))
221+
console.log(
222+
`RESULTS FOR TASK ${index + 1}/${commitsToRun.length}: ${commit.id} (${commit.sha.slice(0, 7)})`,
223+
)
224+
console.log('='.repeat(80))
225+
226+
commitTraces.forEach((trace, traceIndex) => {
227+
const formattedOutput = formatAgentResult({
228+
agentId: trace.agentId,
229+
commit,
230+
judging: trace.judgeResult,
231+
cost: trace.cost,
232+
durationMs: trace.durationMs,
233+
error: trace.error,
234+
traceFilePath: path.join(
235+
logsDir,
236+
`${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
237+
),
238+
agentNumber: traceIndex + 1,
239+
totalAgents: commitTraces.length,
240+
})
241+
console.log(formattedOutput)
242+
})
243+
233244
const formattedAnalysis = formatTraceAnalysis({
234245
commit,
235246
overallAnalysis,

0 commit comments

Comments
 (0)