@@ -45,7 +45,7 @@ export async function runGitEvals2(options: {
4545 const date = new Date ( ) . toISOString ( ) . replace ( / : / g, '-' ) . slice ( 0 , 16 ) // YYYY-MM-DDTHH-MM
4646 const outputDir = outputPath
4747 ? path . dirname ( outputPath )
48- : 'evals/git-evals2/ results'
48+ : path . join ( __dirname , ' results')
4949 const logsDir = path . join ( outputDir , 'logs' , date )
5050 if ( ! fs . existsSync ( logsDir ) ) {
5151 fs . mkdirSync ( logsDir , { recursive : true } )
@@ -62,8 +62,10 @@ export async function runGitEvals2(options: {
6262 }
6363
6464 for ( const commit of commitsToRun ) {
65- console . log ( `\n=== Evaluating ${ commit . id } ===` )
66- console . log ( `Prompt: ${ commit . prompt . slice ( 0 , 100 ) } ...` )
65+ console . log (
66+ `\n=== Evaluating task: ${ commit . id } (${ commit . sha . slice ( 0 , 7 ) } ) ===` ,
67+ )
68+ console . log ( `Prompt: ${ commit . prompt } ` )
6769
6870 // Store trace data for this commit to analyze later
6971 const commitTraces : AgentTraceData [ ] = [ ]
@@ -73,6 +75,7 @@ export async function runGitEvals2(options: {
7375 type : 'agent_start' ,
7476 agent : agentId ,
7577 commit : commit . sha ,
78+ evalId : commit . id ,
7679 } )
7780
7881 try {
@@ -93,6 +96,17 @@ export async function runGitEvals2(options: {
9396 error : agentResult . error ,
9497 } )
9598
99+ console . log ( `\n[${ agentId } ] Judge Results:` )
100+ console . log ( ` Overall Score: ${ judgeResult . overallScore } /10` )
101+ console . log ( ` Completion: ${ judgeResult . completionScore } /10` )
102+ console . log ( ` Code Quality: ${ judgeResult . codeQualityScore } /10` )
103+ if ( judgeResult . strengths . length > 0 ) {
104+ console . log ( ` Strengths: ${ judgeResult . strengths . join ( ', ' ) } ` )
105+ }
106+ if ( judgeResult . weaknesses . length > 0 ) {
107+ console . log ( ` Weaknesses: ${ judgeResult . weaknesses . join ( ', ' ) } ` )
108+ }
109+
96110 const evalRun = {
97111 commitSha : commit . sha ,
98112 spec : commit . spec ,
@@ -133,6 +147,7 @@ export async function runGitEvals2(options: {
133147 type : 'agent_complete' ,
134148 agent : agentId ,
135149 commit : commit . sha ,
150+ evalId : commit . id ,
136151 score : judgeResult . overallScore ,
137152 } )
138153
@@ -145,6 +160,7 @@ export async function runGitEvals2(options: {
145160 type : 'agent_error' ,
146161 agent : agentId ,
147162 commit : commit . sha ,
163+ evalId : commit . id ,
148164 error : errorMessage ,
149165 } )
150166
@@ -208,9 +224,30 @@ export async function runGitEvals2(options: {
208224 spec : commit . spec ,
209225 }
210226
227+ const { overallAnalysis, agentFeedback, recommendations } = analysis
211228 fs . writeFileSync ( analysisPath , JSON . stringify ( analysisData , null , 2 ) )
212229 console . log ( `Analysis saved to ${ analysisPath } ` )
213- console . log ( `\nOverall Analysis: ${ analysis . overallAnalysis } ` )
230+ console . log ( `\n=== Trace Analysis ===` )
231+ console . log ( overallAnalysis )
232+ if ( agentFeedback . length > 0 ) {
233+ console . log ( `\nAgent-Specific Feedback:` )
234+ agentFeedback . forEach ( ( feedback : any ) => {
235+ console . log ( `\n [${ feedback . agentId } ]` )
236+ if ( feedback . strengths . length > 0 ) {
237+ console . log ( ` Strengths: ${ feedback . strengths . join ( ', ' ) } ` )
238+ }
239+ if ( feedback . weaknesses . length > 0 ) {
240+ console . log ( ` Weaknesses: ${ feedback . weaknesses . join ( ', ' ) } ` )
241+ }
242+ console . log ( ` Performance: ${ feedback . relativePerformance } ` )
243+ } )
244+ }
245+ if ( recommendations . length > 0 ) {
246+ console . log ( `\nRecommendations:` )
247+ recommendations . forEach ( ( r : string ) =>
248+ console . log ( ` - ${ r } ` ) ,
249+ )
250+ }
214251 } catch ( error ) {
215252 console . error (
216253 `Failed to analyze traces for commit ${ commit . sha } :` ,
@@ -282,9 +319,11 @@ export async function runGitEvals2(options: {
282319 console . log ( '\n=== Summary ===' )
283320 for ( const [ agentId , data ] of Object . entries ( results ) ) {
284321 console . log ( `\n${ agentId } :` )
285- console . log ( ` Score: ${ data . averageScore . toFixed ( 2 ) } /10` )
286- console . log ( ` Cost: $${ data . averageCost . toFixed ( 4 ) } ` )
287- console . log ( ` Duration: ${ ( data . averageDuration / 1000 ) . toFixed ( 1 ) } s` )
322+ console . log ( ` Average Score: ${ data . averageScore . toFixed ( 2 ) } /10` )
323+ console . log ( ` Average Cost: $${ data . averageCost . toFixed ( 4 ) } ` )
324+ console . log (
325+ ` Average Duration: ${ ( data . averageDuration / 1000 ) . toFixed ( 1 ) } s` ,
326+ )
288327 console . log (
289328 ` Success: ${ data . runs . filter ( ( r ) => ! r . error ) . length } /${ data . runs . length } ` ,
290329 )
0 commit comments