Skip to content

Commit 45de9de

Browse files
committed
Misc improvements: write a file result log. Use gpt5 for trace analysis
1 parent 337156e commit 45de9de

File tree

3 files changed

+60
-32
lines changed

3 files changed

+60
-32
lines changed

evals/git-evals2/example.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ async function main() {
77
const results = await runGitEvals2({
88
evalDataPath: path.join(__dirname, '../git-evals/eval-codebuff2.json'),
99
agents: ['base', 'base-lite'],
10-
outputPath: path.join(__dirname, '../git-evals2/example-results.json'),
1110
limit: 3,
1211
onProgress: (event) => {
1312
if (event.type === 'agent_start') {

evals/git-evals2/run-git-evals2.ts

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -198,16 +198,16 @@ export async function runGitEvals2(options: {
198198

199199
const analysisData = {
200200
commitSha: commit.sha,
201-
spec: commit.spec,
202201
timestamp: new Date().toISOString(),
203-
analysis,
202+
...analysis,
204203
results: commitTraces.map((t) => ({
205204
agentId: t.agentId,
206205
...t.judgeResult,
207206
cost: t.cost,
208207
durationMs: t.durationMs,
209208
error: t.error,
210209
})),
210+
spec: commit.spec,
211211
}
212212

213213
fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
@@ -259,7 +259,28 @@ export async function runGitEvals2(options: {
259259
console.log(`\nResults written to ${outputPath}`)
260260
}
261261

262-
console.log(`\nTraces saved to ${logsDir}`)
262+
const logFiles = fs.readdirSync(logsDir)
263+
264+
const finalResults = {
265+
metadata: {
266+
timestamp: result.timestamp,
267+
evalDataPath,
268+
agentsTested: agents,
269+
commitsEvaluated: commitsToRun.length,
270+
totalCommitsInEval: evalData.evalCommits.length,
271+
repoUrl: evalData.repoUrl,
272+
initCommand: evalData.initCommand,
273+
totalDuration: result.totalDuration,
274+
logsDirectory: logsDir,
275+
files: logFiles,
276+
},
277+
...result.agents,
278+
}
279+
280+
const finalResultsPath = path.join(logsDir, 'FINAL_RESULTS.json')
281+
fs.writeFileSync(finalResultsPath, JSON.stringify(finalResults, null, 2))
282+
283+
console.log(`Traces saved to ${logsDir}`)
263284
console.log('\n=== Summary ===')
264285
for (const [agentId, data] of Object.entries(results)) {
265286
console.log(`\n${agentId}:`)

evals/git-evals2/trace-analyzer.ts

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -16,24 +16,15 @@ export interface AgentTraceData {
1616
timestamp: string
1717
}
1818

19-
interface AgentComparison {
20-
overallAnalysis: string
21-
agentFeedback: Array<{
22-
agentId: string
23-
strengths: string[]
24-
weaknesses: string[]
25-
relativePerformance: string
26-
}>
27-
recommendations: string[]
28-
}
29-
3019
function truncateTrace(trace: AgentStep[]): AgentStep[] {
3120
return trace.map((step) => ({
3221
...step,
3322
toolResults: step.toolResults.map((result) => {
3423
// Truncate read_files, run_terminal_command, and code_search results to save tokens
3524
if (result.toolName === 'read_files' && result.output) {
36-
const output = Array.isArray(result.output) ? result.output : [result.output]
25+
const output = Array.isArray(result.output)
26+
? result.output
27+
: [result.output]
3728
const truncatedOutput = output.map((item: any) => {
3829
if (item.type === 'json' && Array.isArray(item.value)) {
3930
// Truncate file contents in read_files results
@@ -58,19 +49,22 @@ function truncateTrace(trace: AgentStep[]): AgentStep[] {
5849
output: truncatedOutput,
5950
}
6051
}
61-
52+
6253
// Truncate run_terminal_command results (keep first 500 chars)
6354
if (result.toolName === 'run_terminal_command' && result.output) {
64-
const output = Array.isArray(result.output) ? result.output : [result.output]
55+
const output = Array.isArray(result.output)
56+
? result.output
57+
: [result.output]
6558
const truncatedOutput = output.map((item: any) => {
6659
if (item.type === 'json' && item.value?.stdout) {
6760
return {
6861
...item,
6962
value: {
7063
...item.value,
71-
stdout: item.value.stdout.length > 500
72-
? item.value.stdout.slice(0, 500) + '... [TRUNCATED]'
73-
: item.value.stdout,
64+
stdout:
65+
item.value.stdout.length > 500
66+
? item.value.stdout.slice(0, 500) + '... [TRUNCATED]'
67+
: item.value.stdout,
7468
},
7569
}
7670
}
@@ -81,19 +75,22 @@ function truncateTrace(trace: AgentStep[]): AgentStep[] {
8175
output: truncatedOutput,
8276
}
8377
}
84-
78+
8579
// Truncate code_search results (keep first 500 chars)
8680
if (result.toolName === 'code_search' && result.output) {
87-
const output = Array.isArray(result.output) ? result.output : [result.output]
81+
const output = Array.isArray(result.output)
82+
? result.output
83+
: [result.output]
8884
const truncatedOutput = output.map((item: any) => {
8985
if (item.type === 'json' && item.value?.stdout) {
9086
return {
9187
...item,
9288
value: {
9389
...item.value,
94-
stdout: item.value.stdout.length > 500
95-
? item.value.stdout.slice(0, 500) + '... [TRUNCATED]'
96-
: item.value.stdout,
90+
stdout:
91+
item.value.stdout.length > 500
92+
? item.value.stdout.slice(0, 500) + '... [TRUNCATED]'
93+
: item.value.stdout,
9794
},
9895
}
9996
}
@@ -104,7 +101,7 @@ function truncateTrace(trace: AgentStep[]): AgentStep[] {
104101
output: truncatedOutput,
105102
}
106103
}
107-
104+
108105
return result
109106
}),
110107
}))
@@ -113,7 +110,7 @@ function truncateTrace(trace: AgentStep[]): AgentStep[] {
113110
const traceAnalyzerAgent: AgentDefinition = {
114111
id: 'git-evals2-trace-analyzer',
115112
displayName: 'Git Evals2 Trace Analyzer',
116-
model: 'anthropic/claude-3.5-sonnet',
113+
model: 'openai/gpt-5',
117114
toolNames: ['set_output'],
118115
inputSchema: {
119116
prompt: { type: 'string', description: 'The analysis prompt' },
@@ -205,7 +202,16 @@ export async function analyzeAgentTraces({
205202
client: CodebuffClient
206203
traces: AgentTraceData[]
207204
spec: string
208-
}): Promise<AgentComparison> {
205+
}): Promise<{
206+
overallAnalysis: string
207+
agentFeedback: Array<{
208+
agentId: string
209+
strengths: string[]
210+
weaknesses: string[]
211+
relativePerformance: string
212+
}>
213+
recommendations: string[]
214+
}> {
209215
const truncatedTraces = traces.map((t) => ({
210216
agentId: t.agentId,
211217
trace: truncateTrace(t.trace),
@@ -247,10 +253,12 @@ Focus on the HOW, not the WHAT: We want to understand and improve how agents wor
247253
agentDefinitions: [traceAnalyzerAgent],
248254
})
249255

250-
if (analyzerResult.output.type !== 'structuredOutput') {
256+
const { output } = analyzerResult
257+
258+
if (output.type !== 'structuredOutput' || output.value === null) {
251259
console.error(
252260
'Error running trace analyzer - not structured output',
253-
JSON.stringify(analyzerResult.output, null, 2),
261+
JSON.stringify(output, null, 2),
254262
)
255263
return {
256264
overallAnalysis: 'Error running trace analyzer - not structured output',
@@ -259,5 +267,5 @@ Focus on the HOW, not the WHAT: We want to understand and improve how agents wor
259267
}
260268
}
261269

262-
return analyzerResult.output.value as AgentComparison
270+
return output.value as any
263271
}

0 commit comments

Comments
 (0)