Skip to content

Commit 1233cd1

Browse files
committed
Store agent trace
1 parent 265df6d commit 1233cd1

File tree

2 files changed

+74
-3
lines changed

2 files changed

+74
-3
lines changed

evals/git-evals2/agent-runner.ts

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,18 @@ import { withTestRepo } from '../subagents/test-repo-utils'
77

88
import type { EvalCommit } from './types'
99

10+
/**
 * One recorded step of an agent run: the text the agent produced plus the
 * tool-call and tool-result events captured alongside it.
 */
export interface AgentStep {
  /** Text output accumulated for this step. */
  response: string
  /** Tool-call events recorded for this step, stored as received. */
  toolCalls: any[]
  /** Tool-result events recorded for this step, stored as received. */
  toolResults: any[]
}
15+
1016
/** Result returned by `runAgentOnCommit` for a single agent run. */
export interface AgentRunResult {
  /** Diff text captured for the run. */
  diff: string
  /** Duration of the run, in milliseconds. */
  durationMs: number
  /** Cost of the run, derived from the credits the main agent used. */
  cost: number
  /** Error message when one was recorded for the run; otherwise undefined. */
  error?: string
  /** Steps recorded from the events streamed during the run. */
  trace: AgentStep[]
}
1623

1724
export async function runAgentOnCommit({
@@ -31,6 +38,7 @@ export async function runAgentOnCommit({
3138
let diff = ''
3239
let error: string | undefined
3340
let cost = 0
41+
const trace: AgentStep[] = []
3442

3543
try {
3644
await withTestRepo(
@@ -45,13 +53,44 @@ export async function runAgentOnCommit({
4553
await loadLocalAgents({ agentsPath }),
4654
)
4755

56+
let responseText = ''
57+
let toolCalls: any[] = []
58+
let toolResults: any[] = []
59+
60+
/**
 * Appends the step accumulated so far to `trace` and resets the accumulators.
 * Does nothing when no text, tool calls, or tool results have been collected,
 * so calling it repeatedly never records an empty step.
 */
function flushStep(): void {
  const hasContent =
    responseText.length > 0 || toolCalls.length > 0 || toolResults.length > 0
  if (!hasContent) {
    return
  }

  trace.push({ response: responseText, toolCalls, toolResults })

  // Rebind to fresh arrays rather than clearing in place: the step that was
  // just pushed still references the previous arrays.
  responseText = ''
  toolCalls = []
  toolResults = []
}
68+
4869
const result = await client.run({
4970
agent: agentId,
5071
prompt: commit.spec,
5172
agentDefinitions: localAgentDefinitions,
5273
cwd: repoDir,
74+
// Groups streamed events into trace steps: text is appended to the current
// step, tool calls and tool results are collected next to it, and the step
// is flushed when a new step begins or the run finishes.
handleEvent: (event) => {
  switch (event.type) {
    case 'text':
      // Text arriving after tool results starts a new step, so close the
      // previous one first.
      if (toolResults.length > 0) {
        flushStep()
      }
      responseText += event.text
      break
    case 'tool_call':
      // set_messages calls are left out of the recorded trace.
      if (event.toolName !== 'set_messages') {
        toolCalls.push(event)
      }
      break
    case 'tool_result':
      // NOTE(review): every tool result is recorded, including any that
      // belong to a filtered set_messages call — confirm this is intended.
      toolResults.push(event)
      break
    case 'finish':
      flushStep()
      break
  }
},
5391
})
5492

93+
flushStep()
5594
cost = result.sessionState.mainAgentState.creditsUsed / 100
5695

5796
execSync('git add .', { cwd: repoDir, stdio: 'ignore' })
@@ -72,5 +111,6 @@ export async function runAgentOnCommit({
72111
durationMs,
73112
cost,
74113
error,
114+
trace,
75115
}
76116
}

evals/git-evals2/run-git-evals2.ts

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@ export async function runGitEvals2(
2020
): Promise<GitEvals2Result> {
2121
const { evalDataPath, agents, outputPath, limit, onProgress } = options
2222

23-
const evalData: EvalData = JSON.parse(
24-
fs.readFileSync(evalDataPath, 'utf-8'),
25-
)
23+
const evalData: EvalData = JSON.parse(fs.readFileSync(evalDataPath, 'utf-8'))
2624
const commitsToRun = limit
2725
? evalData.evalCommits.slice(0, limit)
2826
: evalData.evalCommits
@@ -36,6 +34,16 @@ export async function runGitEvals2(
3634
const startTime = Date.now()
3735
const results = new Map<string, AgentEvalResults>()
3836

37+
// Create logs directory with current date and time
// NOTE(review): the name has minute granularity, so runs started within the
// same minute share one directory — confirm that is acceptable.
const date = new Date().toISOString().replace(/:/g, '-').slice(0, 16) // YYYY-MM-DDTHH-MM
const outputDir = outputPath
  ? path.dirname(outputPath)
  : 'evals/git-evals2/results'
const logsDir = path.join(outputDir, 'logs', date)
// `recursive: true` creates missing parents and does not throw when the
// directory already exists, so no existsSync pre-check is needed.
fs.mkdirSync(logsDir, { recursive: true })
46+
3947
for (const agentId of agents) {
4048
results.set(agentId, {
4149
agentId,
@@ -85,6 +93,28 @@ export async function runGitEvals2(
8593
error: agentResult.error,
8694
}
8795

96+
// Save trace to logs directory
// Build a filesystem-safe file name: every character of the agent id other
// than letters, digits and '-' becomes '_', and the commit is identified by
// the first 7 characters of its sha.
const safeAgentId = agentId.replace(/[^a-zA-Z0-9-]/g, '_')
const safeCommitShort = commit.sha.slice(0, 7)
const traceFilename = `${safeAgentId}-${safeCommitShort}.json`
const tracePath = path.join(logsDir, traceFilename)

// Everything recorded for this agent/commit pair, written as one JSON file.
const traceData = {
  agentId,
  commitSha: commit.sha,
  spec: commit.spec,
  trace: agentResult.trace,
  diff: agentResult.diff,
  judgeResult,
  cost: agentResult.cost,
  durationMs: agentResult.durationMs,
  error: agentResult.error,
  timestamp: new Date().toISOString(),
}

fs.writeFileSync(tracePath, JSON.stringify(traceData, null, 2))
console.log(`Trace saved to ${tracePath}`)
117+
88118
onProgress?.({
89119
type: 'agent_complete',
90120
agent: agentId,
@@ -172,6 +202,7 @@ export async function runGitEvals2(
172202
console.log(`\nResults written to ${outputPath}`)
173203
}
174204

205+
console.log(`\nTraces saved to ${logsDir}`)
175206
console.log('\n=== Summary ===')
176207
for (const [agentId, data] of results) {
177208
console.log(`\n${agentId}:`)

0 commit comments

Comments
 (0)