Skip to content

Commit e5b7fd6

Browse files
committed
misc improvements
1 parent 0de25d7 commit e5b7fd6

File tree

3 files changed, with 52 additions and 19 deletions.

3 files changed

+52
-19
lines changed

evals/git-evals2/example.ts

Lines changed: 3 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -6,19 +6,10 @@ async function main() {
66

77
const results = await runGitEvals2({
88
evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
9-
agents: ['base', 'base-lite'],
10-
limit: 3,
9+
agents: ['base', 'base2'],
1110
onProgress: (event) => {
12-
if (event.type === 'agent_start') {
13-
console.log(
14-
`[${event.agent}] Starting on commit ${event.commit.slice(0, 7)}...`,
15-
)
16-
} else if (event.type === 'agent_complete') {
17-
console.log(
18-
`[${event.agent}] ✓ Completed with score ${event.score.toFixed(1)}/10`,
19-
)
20-
} else if (event.type === 'agent_error') {
21-
console.log(`[${event.agent}] ✗ Error: ${event.error}`)
11+
if (event.type === 'agent_error') {
12+
console.log(`[${event.agent}] ✗ ${event.evalId} error: ${event.error}`)
2213
}
2314
},
2415
})

evals/git-evals2/run-git-evals2.ts

Lines changed: 46 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ export async function runGitEvals2(options: {
4545
const date = new Date().toISOString().replace(/:/g, '-').slice(0, 16) // YYYY-MM-DDTHH-MM
4646
const outputDir = outputPath
4747
? path.dirname(outputPath)
48-
: 'evals/git-evals2/results'
48+
: path.join(__dirname, 'results')
4949
const logsDir = path.join(outputDir, 'logs', date)
5050
if (!fs.existsSync(logsDir)) {
5151
fs.mkdirSync(logsDir, { recursive: true })
@@ -62,8 +62,10 @@ export async function runGitEvals2(options: {
6262
}
6363

6464
for (const commit of commitsToRun) {
65-
console.log(`\n=== Evaluating ${commit.id} ===`)
66-
console.log(`Prompt: ${commit.prompt.slice(0, 100)}...`)
65+
console.log(
66+
`\n=== Evaluating task: ${commit.id} (${commit.sha.slice(0, 7)}) ===`,
67+
)
68+
console.log(`Prompt: ${commit.prompt}`)
6769

6870
// Store trace data for this commit to analyze later
6971
const commitTraces: AgentTraceData[] = []
@@ -73,6 +75,7 @@ export async function runGitEvals2(options: {
7375
type: 'agent_start',
7476
agent: agentId,
7577
commit: commit.sha,
78+
evalId: commit.id,
7679
})
7780

7881
try {
@@ -93,6 +96,17 @@ export async function runGitEvals2(options: {
9396
error: agentResult.error,
9497
})
9598

99+
console.log(`\n[${agentId}] Judge Results:`)
100+
console.log(` Overall Score: ${judgeResult.overallScore}/10`)
101+
console.log(` Completion: ${judgeResult.completionScore}/10`)
102+
console.log(` Code Quality: ${judgeResult.codeQualityScore}/10`)
103+
if (judgeResult.strengths.length > 0) {
104+
console.log(` Strengths: ${judgeResult.strengths.join(', ')}`)
105+
}
106+
if (judgeResult.weaknesses.length > 0) {
107+
console.log(` Weaknesses: ${judgeResult.weaknesses.join(', ')}`)
108+
}
109+
96110
const evalRun = {
97111
commitSha: commit.sha,
98112
spec: commit.spec,
@@ -133,6 +147,7 @@ export async function runGitEvals2(options: {
133147
type: 'agent_complete',
134148
agent: agentId,
135149
commit: commit.sha,
150+
evalId: commit.id,
136151
score: judgeResult.overallScore,
137152
})
138153

@@ -145,6 +160,7 @@ export async function runGitEvals2(options: {
145160
type: 'agent_error',
146161
agent: agentId,
147162
commit: commit.sha,
163+
evalId: commit.id,
148164
error: errorMessage,
149165
})
150166

@@ -208,9 +224,30 @@ export async function runGitEvals2(options: {
208224
spec: commit.spec,
209225
}
210226

227+
const { overallAnalysis, agentFeedback, recommendations } = analysis
211228
fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
212229
console.log(`Analysis saved to ${analysisPath}`)
213-
console.log(`\nOverall Analysis: ${analysis.overallAnalysis}`)
230+
console.log(`\n=== Trace Analysis ===`)
231+
console.log(overallAnalysis)
232+
if (agentFeedback.length > 0) {
233+
console.log(`\nAgent-Specific Feedback:`)
234+
agentFeedback.forEach((feedback: any) => {
235+
console.log(`\n [${feedback.agentId}]`)
236+
if (feedback.strengths.length > 0) {
237+
console.log(` Strengths: ${feedback.strengths.join(', ')}`)
238+
}
239+
if (feedback.weaknesses.length > 0) {
240+
console.log(` Weaknesses: ${feedback.weaknesses.join(', ')}`)
241+
}
242+
console.log(` Performance: ${feedback.relativePerformance}`)
243+
})
244+
}
245+
if (recommendations.length > 0) {
246+
console.log(`\nRecommendations:`)
247+
recommendations.forEach((r: string) =>
248+
console.log(` - ${r}`),
249+
)
250+
}
214251
} catch (error) {
215252
console.error(
216253
`Failed to analyze traces for commit ${commit.sha}:`,
@@ -282,9 +319,11 @@ export async function runGitEvals2(options: {
282319
console.log('\n=== Summary ===')
283320
for (const [agentId, data] of Object.entries(results)) {
284321
console.log(`\n${agentId}:`)
285-
console.log(` Score: ${data.averageScore.toFixed(2)}/10`)
286-
console.log(` Cost: $${data.averageCost.toFixed(4)}`)
287-
console.log(` Duration: ${(data.averageDuration / 1000).toFixed(1)}s`)
322+
console.log(` Average Score: ${data.averageScore.toFixed(2)}/10`)
323+
console.log(` Average Cost: $${data.averageCost.toFixed(4)}`)
324+
console.log(
325+
` Average Duration: ${(data.averageDuration / 1000).toFixed(1)}s`,
326+
)
288327
console.log(
289328
` Success: ${data.runs.filter((r) => !r.error).length}/${data.runs.length}`,
290329
)

evals/git-evals2/types.ts

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,16 +69,19 @@ export type ProgressEvent =
6969
type: 'agent_start'
7070
agent: string
7171
commit: string
72+
evalId: string
7273
}
7374
| {
7475
type: 'agent_complete'
7576
agent: string
7677
commit: string
78+
evalId: string
7779
score: number
7880
}
7981
| {
8082
type: 'agent_error'
8183
agent: string
8284
commit: string
85+
evalId: string
8386
error: string
8487
}

0 commit comments

Comments (0)