Skip to content

Commit 4918fe5

Browse files
committed
Judge based on prompt not spec, pass in context files
1 parent 320fd73 commit 4918fe5

File tree

3 files changed

+107
-56
lines changed

3 files changed

+107
-56
lines changed

evals/git-evals2/agent-runner.ts

Lines changed: 34 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
55
import { CodebuffClient } from '../../sdk/src/client'
66
import { withTestRepo } from '../subagents/test-repo-utils'
77

8-
import type { EvalCommit } from './types'
8+
import type { EvalCommitV2 } from './types'
99

1010
export interface AgentStep {
1111
response: string
@@ -15,6 +15,7 @@ export interface AgentStep {
1515

1616
export interface AgentRunResult {
1717
diff: string
18+
contextFiles: Record<string, string>
1819
durationMs: number
1920
cost: number
2021
error?: string
@@ -30,12 +31,14 @@ export async function runAgentOnCommit({
3031
}: {
3132
client: CodebuffClient
3233
agentId: string
33-
commit: EvalCommit
34+
commit: EvalCommitV2
3435
repoUrl: string
3536
initCommand?: string
3637
}): Promise<AgentRunResult> {
38+
console.log(`[${commit.id}] Running agent ${agentId}...`)
3739
const startTime = Date.now()
3840
let diff = ''
41+
let contextFiles: Record<string, string> = {}
3942
let error: string | undefined
4043
let cost = 0
4144
const trace: AgentStep[] = []
@@ -56,9 +59,13 @@ export async function runAgentOnCommit({
5659
let responseText = ''
5760
let toolCalls: any[] = []
5861
let toolResults: any[] = []
59-
62+
6063
function flushStep() {
61-
if (responseText.length > 0 || toolCalls.length > 0 || toolResults.length > 0) {
64+
if (
65+
responseText.length > 0 ||
66+
toolCalls.length > 0 ||
67+
toolResults.length > 0
68+
) {
6269
trace.push({ response: responseText, toolCalls, toolResults })
6370
responseText = ''
6471
toolCalls = []
@@ -68,7 +75,7 @@ export async function runAgentOnCommit({
6875

6976
const result = await client.run({
7077
agent: agentId,
71-
prompt: commit.spec,
78+
prompt: commit.prompt,
7279
agentDefinitions: localAgentDefinitions,
7380
cwd: repoDir,
7481
handleEvent: (event) => {
@@ -98,6 +105,27 @@ export async function runAgentOnCommit({
98105
cwd: repoDir,
99106
encoding: 'utf-8',
100107
})
108+
109+
const contextFilePaths = new Set<string>([
110+
...commit.supplementalFiles,
111+
...commit.fileDiffs.map((fd) => fd.path),
112+
])
113+
114+
for (const filePath of contextFilePaths) {
115+
try {
116+
const content = execSync(
117+
`git show ${commit.parentSha}:${JSON.stringify(filePath)}`,
118+
{
119+
cwd: repoDir,
120+
encoding: 'utf-8',
121+
maxBuffer: 10 * 1024 * 1024,
122+
},
123+
)
124+
contextFiles[filePath] = content
125+
} catch (error) {
126+
contextFiles[filePath] = ''
127+
}
128+
}
101129
},
102130
)
103131
} catch (e) {
@@ -108,6 +136,7 @@ export async function runAgentOnCommit({
108136

109137
return {
110138
diff,
139+
contextFiles,
111140
durationMs,
112141
cost,
113142
error,

evals/git-evals2/judge.ts

Lines changed: 57 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -1,23 +1,22 @@
1-
import { createTwoFilesPatch } from 'diff'
21
import { z } from 'zod/v4'
32

4-
import type { FileState } from './types'
3+
import type { FileDiff } from './types'
54
import type { AgentDefinition } from '../../sdk/src'
65
import type { CodebuffClient } from '../../sdk/src/client'
76

87
export const JudgingResultSchema = z.object({
98
analysis: z
109
.string()
1110
.describe('Detailed analysis comparing agent changes to ground truth'),
12-
strengths: z.array(z.string()).describe('Key strengths of the implementation'),
13-
weaknesses: z
11+
strengths: z
1412
.array(z.string())
15-
.describe('Key weaknesses or issues found'),
13+
.describe('Key strengths of the implementation'),
14+
weaknesses: z.array(z.string()).describe('Key weaknesses or issues found'),
1615
completionScore: z
1716
.number()
1817
.min(0)
1918
.max(10)
20-
.describe('How completely the spec was implemented'),
19+
.describe('How completely the prompt was addressed'),
2120
codeQualityScore: z
2221
.number()
2322
.min(0)
@@ -42,7 +41,8 @@ const judgeAgent: AgentDefinition = {
4241
properties: {
4342
analysis: {
4443
type: 'string',
45-
description: 'Detailed analysis comparing agent changes to ground truth',
44+
description:
45+
'Detailed analysis comparing agent changes to ground truth',
4646
},
4747
strengths: {
4848
type: 'array',
@@ -58,7 +58,7 @@ const judgeAgent: AgentDefinition = {
5858
type: 'number',
5959
minimum: 0,
6060
maximum: 10,
61-
description: 'How completely the spec was implemented',
61+
description: 'How completely the prompt was addressed',
6262
},
6363
codeQualityScore: {
6464
type: 'number',
@@ -82,60 +82,84 @@ const judgeAgent: AgentDefinition = {
8282
'overallScore',
8383
],
8484
},
85-
systemPrompt: `You are an expert software engineer evaluating AI-generated code changes.
85+
systemPrompt: `You are an expert software engineer evaluating AI-generated code changes with empathy for the task given.
8686
8787
## Your Role
8888
8989
You will receive:
90-
1. A spec describing what changes should be made
91-
2. The ground truth changes (expected)
92-
3. The agent's actual changes
90+
1. The user prompt that the coding agent was given
91+
2. Context files from the codebase
92+
3. The ground truth changes (expected outcome)
93+
4. The agent's actual changes
94+
95+
## Evaluation Philosophy
96+
97+
**Judge based on what the agent was asked to do, not on perfection.**
98+
99+
- If the prompt is vague or high-level (e.g., "add authentication"), be lenient and accept any reasonable implementation that achieves the goal
100+
- If the prompt is specific and detailed, expect the implementation to match those details more closely
101+
- Focus on whether the agent understood and addressed the user's intent
102+
- Consider that there are often multiple valid ways to implement the same feature
93103
94104
## Evaluation Criteria
95105
96-
- **Completion** (0-10): How completely was the spec implemented?
106+
- **Completion** (0-10): How well did the agent address what was asked in the prompt? Consider the specificity of the prompt.
97107
- **Code Quality** (0-10): How well-structured and maintainable is the code?
98-
- **Overall** (0-10): Combined quality assessment
108+
- **Overall** (0-10): Combined assessment of whether the agent successfully completed the task as requested
109+
110+
## Ground Truth
99111
100-
Focus on behavioral equivalence - the implementation doesn't need to be identical to ground truth, but should achieve the same outcome. Valid alternative approaches are acceptable.
112+
The ground truth shows ONE valid implementation, but it's not the only correct answer. The agent's implementation should be judged on:
113+
- Does it achieve the same functional outcome?
114+
- Is it a reasonable approach given the prompt?
115+
- Does it maintain code quality?
101116
102117
Provide detailed analysis, strengths, weaknesses, and numerical scores.`,
103118
}
104119

105120
interface JudgeCommitResultInput {
106121
client: CodebuffClient
107-
spec: string
108-
groundTruthFileStates: FileState[]
122+
prompt: string
123+
groundTruthFileDiffs: FileDiff[]
124+
contextFiles: Record<string, string>
109125
agentDiff: string
110126
error?: string
111127
}
112128

113129
export async function judgeCommitResult(
114130
input: JudgeCommitResultInput,
115131
): Promise<JudgingResult> {
116-
const { client, spec, groundTruthFileStates, agentDiff, error } = input
117-
118-
const groundTruthDiffs = groundTruthFileStates
119-
.map(({ path, preContent, postContent }) => {
120-
const diff = createTwoFilesPatch(
121-
path,
122-
path,
123-
preContent,
124-
postContent,
125-
'before',
126-
'after',
127-
)
132+
const {
133+
client,
134+
prompt,
135+
groundTruthFileDiffs,
136+
contextFiles,
137+
agentDiff,
138+
error,
139+
} = input
140+
141+
const groundTruthDiffs = groundTruthFileDiffs
142+
.map(({ path, diff }) => {
128143
return `### ${path}\n\`\`\`diff\n${diff}\n\`\`\``
129144
})
130145
.join('\n\n')
131146

132-
const judgePrompt = `## Task Specification
133-
${spec}
147+
const contextFilesContent = Object.entries(contextFiles)
148+
.map(([filePath, content]) => {
149+
return `### ${filePath}\n\`\`\`\n${content}\n\`\`\``
150+
})
151+
.join('\n\n')
152+
153+
const judgePrompt = `## User Prompt (What the agent was asked to do)
154+
${prompt}
155+
156+
## Context Files (from parent commit)
157+
${contextFilesContent || '(No context files)'}
134158
135-
## Ground Truth Changes (Expected)
159+
## Ground Truth Changes (One valid implementation)
136160
${groundTruthDiffs}
137161
138-
## Agent's Changes (Actual)
162+
## Agent's Changes (What the agent actually did)
139163
\`\`\`diff
140164
${agentDiff || '(No changes made)'}
141165
\`\`\`

evals/git-evals2/run-git-evals2.ts

Lines changed: 16 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import { execSync } from 'child_process'
12
import fs from 'fs'
23
import path from 'path'
34

@@ -8,7 +9,7 @@ import { CodebuffClient } from '../../sdk/src/client'
89
import { runAgentOnCommit } from './agent-runner'
910
import { judgeCommitResult } from './judge'
1011
import { analyzeAgentTraces, type AgentTraceData } from './trace-analyzer'
11-
import { AgentEvalResults, EvalData, ProgressEvent } from './types'
12+
import { AgentEvalResults, EvalDataV2, ProgressEvent } from './types'
1213

1314
export async function runGitEvals2(options: {
1415
evalDataPath: string
@@ -24,7 +25,9 @@ export async function runGitEvals2(options: {
2425
}> {
2526
const { evalDataPath, agents, outputPath, limit, onProgress } = options
2627

27-
const evalData: EvalData = JSON.parse(fs.readFileSync(evalDataPath, 'utf-8'))
28+
const evalData: EvalDataV2 = JSON.parse(
29+
fs.readFileSync(evalDataPath, 'utf-8'),
30+
)
2831
const commitsToRun = limit
2932
? evalData.evalCommits.slice(0, limit)
3033
: evalData.evalCommits
@@ -59,8 +62,8 @@ export async function runGitEvals2(options: {
5962
}
6063

6164
for (const commit of commitsToRun) {
62-
console.log(`\n=== Evaluating commit ${commit.sha.slice(0, 7)} ===`)
63-
console.log(`Spec: ${commit.spec.slice(0, 100)}...`)
65+
console.log(`\n=== Evaluating ${commit.id} ===`)
66+
console.log(`Prompt: ${commit.prompt.slice(0, 100)}...`)
6467

6568
// Store trace data for this commit to analyze later
6669
const commitTraces: AgentTraceData[] = []
@@ -83,8 +86,9 @@ export async function runGitEvals2(options: {
8386

8487
const judgeResult = await judgeCommitResult({
8588
client,
86-
spec: commit.spec,
87-
groundTruthFileStates: commit.fileStates,
89+
prompt: commit.prompt,
90+
groundTruthFileDiffs: commit.fileDiffs,
91+
contextFiles: agentResult.contextFiles,
8892
agentDiff: agentResult.diff,
8993
error: agentResult.error,
9094
})
@@ -100,13 +104,10 @@ export async function runGitEvals2(options: {
100104
}
101105

102106
// Save trace to logs directory
103-
const safeSpec = commit.spec
104-
.split('\n')[0]
105-
.replace(/[^a-zA-Z0-9]/g, '_')
106-
.slice(0, 20)
107+
const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
107108
const safeAgentId = agentId.replace(/[^a-zA-Z0-9-]/g, '_')
108109
const safeCommitShort = commit.sha.slice(0, 7)
109-
const traceFilename = `${safeSpec}-${safeAgentId}-${safeCommitShort}.json`
110+
const traceFilename = `${safeTaskId}-${safeAgentId}-${safeCommitShort}.json`
110111
const tracePath = path.join(logsDir, traceFilename)
111112

112113
const traceData = {
@@ -178,7 +179,7 @@ export async function runGitEvals2(options: {
178179
// After all agents complete for this commit, run trace analysis
179180
if (commitTraces.length > 1) {
180181
console.log(
181-
`\n=== Analyzing agent traces for commit ${commit.sha.slice(0, 7)} ===`,
182+
`\n=== Analyzing agent traces for ${commit.id} (${commit.sha.slice(0, 7)}) ===`,
182183
)
183184
try {
184185
const analysis = await analyzeAgentTraces({
@@ -188,12 +189,9 @@ export async function runGitEvals2(options: {
188189
})
189190

190191
// Save analysis to logs directory
191-
const safeSpec = commit.spec
192-
.split('\n')[0]
193-
.replace(/[^a-zA-Z0-9]/g, '_')
194-
.slice(0, 30)
195-
const safeCommitShort = commit.sha.slice(0, 7)
196-
const analysisFilename = `${safeSpec}-ANALYSIS-${safeCommitShort}.json`
192+
const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
193+
const analysisCommitShort = commit.sha.slice(0, 7)
194+
const analysisFilename = `${safeTaskId}-ANALYSIS-${analysisCommitShort}.json`
197195
const analysisPath = path.join(logsDir, analysisFilename)
198196

199197
const analysisData = {

0 commit comments

Comments
 (0)