Skip to content

Commit 3974b74

Browse files
committed
trace analyzer v1
1 parent a2e06b3 commit 3974b74

File tree

2 files changed

+295
-0
lines changed

2 files changed

+295
-0
lines changed

evals/git-evals2/run-git-evals2.ts

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { CodebuffClient } from '../../sdk/src/client'
77

88
import { runAgentOnCommit } from './agent-runner'
99
import { judgeCommitResult } from './judge'
10+
import { analyzeAgentTraces, type AgentTraceData } from './trace-analyzer'
1011

1112
import type {
1213
EvalData,
@@ -58,6 +59,9 @@ export async function runGitEvals2(
5859
console.log(`\n=== Evaluating commit ${commit.sha.slice(0, 7)} ===`)
5960
console.log(`Spec: ${commit.spec.slice(0, 100)}...`)
6061

62+
// Store trace data for this commit to analyze later
63+
const commitTraces: AgentTraceData[] = []
64+
6165
const agentPromises = agents.map(async (agentId) => {
6266
onProgress?.({
6367
type: 'agent_start',
@@ -119,6 +123,9 @@ export async function runGitEvals2(
119123
fs.writeFileSync(tracePath, JSON.stringify(traceData, null, 2))
120124
console.log(`Trace saved to ${tracePath}`)
121125

126+
// Store for later analysis
127+
commitTraces.push(traceData)
128+
122129
onProgress?.({
123130
type: 'agent_complete',
124131
agent: agentId,
@@ -160,6 +167,45 @@ export async function runGitEvals2(
160167
const agentData = results.get(agentId)!
161168
agentData.runs.push(evalRun)
162169
}
170+
171+
// After all agents complete for this commit, run trace analysis
172+
if (commitTraces.length > 1) {
173+
console.log(
174+
`\n=== Analyzing agent traces for commit ${commit.sha.slice(0, 7)} ===`,
175+
)
176+
try {
177+
const analysis = await analyzeAgentTraces({
178+
client,
179+
traces: commitTraces,
180+
spec: commit.spec,
181+
})
182+
183+
// Save analysis to logs directory
184+
const safeSpec = commit.spec
185+
.split('\n')[0]
186+
.replace(/[^a-zA-Z0-9]/g, '_')
187+
.slice(0, 30)
188+
const safeCommitShort = commit.sha.slice(0, 7)
189+
const analysisFilename = `${safeSpec}-ANALYSIS-${safeCommitShort}.json`
190+
const analysisPath = path.join(logsDir, analysisFilename)
191+
192+
const analysisData = {
193+
commitSha: commit.sha,
194+
spec: commit.spec,
195+
timestamp: new Date().toISOString(),
196+
analysis,
197+
}
198+
199+
fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
200+
console.log(`Analysis saved to ${analysisPath}`)
201+
console.log(`\nOverall Analysis: ${analysis.overallAnalysis}`)
202+
} catch (error) {
203+
console.error(
204+
`Failed to analyze traces for commit ${commit.sha}:`,
205+
error,
206+
)
207+
}
208+
}
163209
}
164210

165211
for (const [agentId, agentData] of results) {

evals/git-evals2/trace-analyzer.ts

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
import type { AgentStep } from './agent-runner'
2+
import type { JudgingResult } from './judge'
3+
import type { AgentDefinition } from '../../sdk/src'
4+
import type { CodebuffClient } from '../../sdk/src/client'
5+
6+
/**
 * Everything captured about a single agent's run against one commit.
 * Written to disk per run and later fed into the cross-agent trace analysis.
 */
export interface AgentTraceData {
  // Identifier of the agent that produced this trace.
  agentId: string
  // SHA of the commit the agent was evaluated against.
  commitSha: string
  // Task specification the agent was asked to implement.
  spec: string
  // Step-by-step execution record (tool calls and their results).
  trace: AgentStep[]
  // Diff the agent produced for the task.
  diff: string
  // Scores and analysis from the judging pass.
  judgeResult: JudgingResult
  // Total cost of the run (units per the runner — TODO confirm currency/credits).
  cost: number
  // Wall-clock duration of the run in milliseconds.
  durationMs: number
  // Error message when the run failed; absent on success.
  error?: string
  // When the run completed (presumably an ISO-8601 string — confirm with writer).
  timestamp: string
}
18+
19+
/**
 * Structured comparison returned by the trace-analyzer agent.
 * Mirrors the analyzer's outputSchema.
 */
interface AgentComparison {
  // Cross-agent comparison narrative for the task as a whole.
  overallAnalysis: string
  // One qualitative assessment per agent.
  agentFeedback: Array<{
    agentId: string
    // What the agent did well (specific actions from its trace).
    strengths: string[]
    // What the agent struggled with (specific issues from its trace).
    weaknesses: string[]
    // How this agent performed relative to the others on this task.
    relativePerformance: string
  }>
  // Actionable suggestions for improving the agents.
  recommendations: string[]
}
29+
30+
function truncateTrace(trace: AgentStep[]): AgentStep[] {
31+
return trace.map((step) => ({
32+
...step,
33+
toolResults: step.toolResults.map((result) => {
34+
// Truncate read_files, run_terminal_command, and code_search results to save tokens
35+
if (result.toolName === 'read_files' && result.output) {
36+
const output = Array.isArray(result.output) ? result.output : [result.output]
37+
const truncatedOutput = output.map((item: any) => {
38+
if (item.type === 'json' && Array.isArray(item.value)) {
39+
// Truncate file contents in read_files results
40+
return {
41+
...item,
42+
value: item.value.map((file: any) => {
43+
if (file.path && file.content) {
44+
return {
45+
path: file.path,
46+
content: '[TRUNCATED - file was read]',
47+
referencedBy: file.referencedBy,
48+
}
49+
}
50+
return file
51+
}),
52+
}
53+
}
54+
return item
55+
})
56+
return {
57+
...result,
58+
output: truncatedOutput,
59+
}
60+
}
61+
62+
// Truncate run_terminal_command results (keep first 500 chars)
63+
if (result.toolName === 'run_terminal_command' && result.output) {
64+
const output = Array.isArray(result.output) ? result.output : [result.output]
65+
const truncatedOutput = output.map((item: any) => {
66+
if (item.type === 'json' && item.value?.stdout) {
67+
return {
68+
...item,
69+
value: {
70+
...item.value,
71+
stdout: item.value.stdout.length > 500
72+
? item.value.stdout.slice(0, 500) + '... [TRUNCATED]'
73+
: item.value.stdout,
74+
},
75+
}
76+
}
77+
return item
78+
})
79+
return {
80+
...result,
81+
output: truncatedOutput,
82+
}
83+
}
84+
85+
// Truncate code_search results (keep first 500 chars)
86+
if (result.toolName === 'code_search' && result.output) {
87+
const output = Array.isArray(result.output) ? result.output : [result.output]
88+
const truncatedOutput = output.map((item: any) => {
89+
if (item.type === 'json' && item.value?.stdout) {
90+
return {
91+
...item,
92+
value: {
93+
...item.value,
94+
stdout: item.value.stdout.length > 500
95+
? item.value.stdout.slice(0, 500) + '... [TRUNCATED]'
96+
: item.value.stdout,
97+
},
98+
}
99+
}
100+
return item
101+
})
102+
return {
103+
...result,
104+
output: truncatedOutput,
105+
}
106+
}
107+
108+
return result
109+
}),
110+
}))
111+
}
112+
113+
/**
 * Inline agent definition for the cross-agent trace analyzer.
 * Runs with only the set_output tool and must return structured output
 * matching outputSchema (deserialized as AgentComparison by the caller).
 */
const traceAnalyzerAgent: AgentDefinition = {
  id: 'git-evals2-trace-analyzer',
  displayName: 'Git Evals2 Trace Analyzer',
  model: 'anthropic/claude-3.5-sonnet',
  // set_output only: the agent's sole job is to emit the structured comparison.
  toolNames: ['set_output'],
  inputSchema: {
    prompt: { type: 'string', description: 'The analysis prompt' },
  },
  outputMode: 'structured_output',
  // JSON Schema mirrored by the AgentComparison interface — keep in sync.
  outputSchema: {
    type: 'object',
    properties: {
      overallAnalysis: {
        type: 'string',
        description: 'Overall comparison of all agents',
      },
      agentFeedback: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            agentId: { type: 'string' },
            strengths: {
              type: 'array',
              items: { type: 'string' },
            },
            weaknesses: {
              type: 'array',
              items: { type: 'string' },
            },
            relativePerformance: {
              type: 'string',
              description: 'How this agent performed relative to others',
            },
          },
          required: [
            'agentId',
            'strengths',
            'weaknesses',
            'relativePerformance',
          ],
        },
      },
      recommendations: {
        type: 'array',
        items: { type: 'string' },
        description: 'Recommendations for improving agents',
      },
    },
    required: ['overallAnalysis', 'agentFeedback', 'recommendations'],
  },
  systemPrompt: `You are an expert AI agent evaluator comparing multiple coding agents on the same task.

## Your Role

You will receive:
1. A task specification
2. Full traces from each agent showing their approach and execution
3. Results including:
- Judge results (completion score, code quality score, overall score, analysis, strengths, weaknesses)
- Cost efficiency
- Time efficiency
- Whether they produced valid diffs
- Any errors encountered
- Number of trace steps taken

## Analysis Criteria

Provide:
- **Overall Analysis**: Compare how agents performed on this task, analyzing their different approaches
- **Agent Feedback**: For each agent, list:
- Strengths: What this agent did well (specific actions from trace)
- Weaknesses: What this agent struggled with (specific issues from trace)
- Relative Performance: How this agent compared to others
- **Recommendations**: Actionable suggestions for improving the agents based on observed behavior

Focus on comparative insights - how agents differ in their approaches, tool usage patterns, efficiency, and results.
Note: read_files tool results show [TRUNCATED] for file contents to save space.`,
}
192+
193+
export async function analyzeAgentTraces({
194+
client,
195+
traces,
196+
spec,
197+
}: {
198+
client: CodebuffClient
199+
traces: AgentTraceData[]
200+
spec: string
201+
}): Promise<AgentComparison> {
202+
const truncatedTraces = traces.map((t) => ({
203+
agentId: t.agentId,
204+
trace: truncateTrace(t.trace),
205+
judgeResult: t.judgeResult,
206+
cost: t.cost,
207+
durationMs: t.durationMs,
208+
error: t.error,
209+
}))
210+
211+
const prompt = `## Task Specification
212+
${spec}
213+
214+
## Agent Traces and Results
215+
${JSON.stringify(truncatedTraces, null, 2)}
216+
217+
Please compare these agents and provide:
218+
1. An overall analysis of how the agents performed, including differences in their approaches
219+
2. Specific feedback for each agent including strengths, weaknesses, and how they performed relative to others
220+
3. Recommendations for improving the agents
221+
222+
Focus on:
223+
- Judge results (completion score, code quality score, overall score, analysis, strengths, weaknesses)
224+
- Approach and tool usage patterns from the traces
225+
- Cost efficiency
226+
- Time efficiency
227+
- Whether they produced valid diffs
228+
- Any errors encountered`
229+
230+
const analyzerResult = await client.run({
231+
agent: 'git-evals2-trace-analyzer',
232+
prompt,
233+
agentDefinitions: [traceAnalyzerAgent],
234+
})
235+
236+
if (analyzerResult.output.type !== 'structuredOutput') {
237+
console.error(
238+
'Error running trace analyzer - not structured output',
239+
JSON.stringify(analyzerResult.output, null, 2),
240+
)
241+
return {
242+
overallAnalysis: 'Error running trace analyzer - not structured output',
243+
agentFeedback: [],
244+
recommendations: ['Trace analyzer failed to provide structured output'],
245+
}
246+
}
247+
248+
return analyzerResult.output.value as AgentComparison
249+
}

0 commit comments

Comments
 (0)