@@ -67,10 +67,10 @@ export async function runBuffBench(options: {
6767
6868 const commitLimit = pLimit ( commitConcurrency )
6969
70- const commitPromises = commitsToRun . map ( ( commit ) =>
70+ const commitPromises = commitsToRun . map ( ( commit , index ) =>
7171 commitLimit ( async ( ) => {
7272 console . log (
73- `\n=== Evaluating task : ${ commit . id } (${ commit . sha . slice ( 0 , 7 ) } ) ===` ,
73+ `\n=== Task ${ index + 1 } / ${ commitsToRun . length } : ${ commit . id } (${ commit . sha . slice ( 0 , 7 ) } ) ===` ,
7474 )
7575 console . log ( `Prompt: ${ commit . prompt } ` )
7676
@@ -120,18 +120,8 @@ export async function runBuffBench(options: {
120120 const traceFilename = `${ safeTaskId } -${ safeAgentId } -${ safeCommitShort } .json`
121121 const tracePath = path . join ( logsDir , traceFilename )
122122
123- const formattedOutput = formatAgentResult ( {
124- agentId,
125- commit,
126- judging : judgeResult ,
127- cost : agentResult . cost ,
128- durationMs : agentResult . durationMs ,
129- error : agentResult . error ,
130- traceFilePath : tracePath ,
131- } )
132- console . log ( formattedOutput )
133-
134- const traceData = {
123+ // Store judging result and trace for combined output later
124+ commitTraces . push ( {
135125 agentId,
136126 commitSha : commit . sha ,
137127 spec : commit . spec ,
@@ -142,13 +132,12 @@ export async function runBuffBench(options: {
142132 durationMs : agentResult . durationMs ,
143133 error : agentResult . error ,
144134 timestamp : new Date ( ) . toISOString ( ) ,
145- }
146-
147- fs . writeFileSync ( tracePath , JSON . stringify ( traceData , null , 2 ) )
148- console . log ( `Trace saved to ${ tracePath } ` )
135+ } )
149136
150- // Store for later analysis
151- commitTraces . push ( traceData )
137+ fs . writeFileSync (
138+ tracePath ,
139+ JSON . stringify ( commitTraces [ commitTraces . length - 1 ] , null , 2 ) ,
140+ )
152141
153142 onProgress ?.( {
154143 type : 'agent_complete' ,
@@ -197,9 +186,6 @@ export async function runBuffBench(options: {
197186
198187 // After all agents complete for this commit, run trace analysis
199188 if ( commitTraces . length > 1 ) {
200- console . log (
201- `\n=== Analyzing agent traces for ${ commit . id } (${ commit . sha . slice ( 0 , 7 ) } ) ===` ,
202- )
203189 try {
204190 const analysis = await analyzeAgentTraces ( {
205191 client,
@@ -230,6 +216,31 @@ export async function runBuffBench(options: {
230216 const { overallAnalysis, agentFeedback } = analysis
231217 fs . writeFileSync ( analysisPath , JSON . stringify ( analysisData , null , 2 ) )
232218
219+ // Print all agent results with their judging, then trace analysis together
220+ console . log ( '\n' + '=' . repeat ( 80 ) )
221+ console . log (
222+ `RESULTS FOR TASK ${ index + 1 } /${ commitsToRun . length } : ${ commit . id } (${ commit . sha . slice ( 0 , 7 ) } )` ,
223+ )
224+ console . log ( '=' . repeat ( 80 ) )
225+
226+ commitTraces . forEach ( ( trace , traceIndex ) => {
227+ const formattedOutput = formatAgentResult ( {
228+ agentId : trace . agentId ,
229+ commit,
230+ judging : trace . judgeResult ,
231+ cost : trace . cost ,
232+ durationMs : trace . durationMs ,
233+ error : trace . error ,
234+ traceFilePath : path . join (
235+ logsDir ,
236+ `${ commit . id . replace ( / [ ^ a - z A - Z 0 - 9 - ] / g, '_' ) } -${ trace . agentId . replace ( / [ ^ a - z A - Z 0 - 9 - ] / g, '_' ) } -${ commit . sha . slice ( 0 , 7 ) } .json` ,
237+ ) ,
238+ agentNumber : traceIndex + 1 ,
239+ totalAgents : commitTraces . length ,
240+ } )
241+ console . log ( formattedOutput )
242+ } )
243+
233244 const formattedAnalysis = formatTraceAnalysis ( {
234245 commit,
235246 overallAnalysis,
0 commit comments