Skip to content

Commit 5c5e2f0

Browse files
committed
Eval planner
1 parent 929d112 commit 5c5e2f0

File tree

4 files changed

+321
-3
lines changed

4 files changed

+321
-3
lines changed

backend/src/tools/prompts.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,8 @@ ${getToolCallString(
244244
false,
245245
)}
246246
247-
${toolDescriptions.join('\n\n')}
248-
249247
Important: You only have access to the tools below. Do not use any other tools -- they are not available to you, instead they may have been previously used by other agents.
248+
249+
${toolDescriptions.join('\n\n')}
250250
`.trim()
251251
}

bun.lock

Lines changed: 4 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

evals/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
"@oclif/core": "^4.4.0",
3939
"@oclif/parser": "^3.8.17",
4040
"async": "^3.2.6",
41+
"diff": "^8.0.2",
4142
"lodash": "^4.17.21",
4243
"p-limit": "^6.2.0",
4344
"zod": "3.25.67"

evals/subagents/eval-planner.ts

Lines changed: 314 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,314 @@
1+
import { execFileSync, execSync } from 'child_process'
import * as fs from 'fs'
import * as os from 'os'
import * as path from 'path'

import { createTwoFilesPatch } from 'diff'

import { CodebuffClient } from '../../sdk/src/client'
import { AgentDefinition } from '../../sdk/src'
import { getUserCredentials } from '@codebuff/npm-app/credentials'
import { API_KEY_ENV_VAR } from '@codebuff/common/old-constants'
import implementationPlannerAgent from '../../.agents/implementation-planner/implementation-planner'
13+
/**
14+
* Helper function to manage test repository lifecycle
15+
* Sets up a test repo, runs a function with the repo cwd, then cleans up
16+
*/
17+
/**
 * Runs `fn` against a throwaway checkout of a git repository.
 *
 * Clones `repoUrl` into a fresh temporary directory, checks out `commitSha`,
 * optionally runs `initCommand` inside the checkout, and then invokes `fn`
 * with the checkout path. The temporary directory is removed afterwards,
 * whether `fn` resolves or any step throws.
 *
 * @param repoConfig.repoUrl - Repository to clone (passed to `git clone`).
 * @param repoConfig.commitSha - Revision passed to `git checkout`.
 * @param repoConfig.initCommand - Optional shell command run inside the
 *   checkout before `fn` is called.
 * @param fn - Callback that receives the path of the checkout.
 * @returns Whatever `fn` resolves to.
 * @throws If cloning, checkout, the init command, or `fn` itself fails.
 */
export const withTestRepo = async <T>(
  repoConfig: {
    repoUrl: string
    commitSha: string
    initCommand?: string
  },
  fn: (cwd: string) => Promise<T>,
): Promise<T> => {
  const { repoUrl, commitSha, initCommand } = repoConfig

  // Create a temporary directory for the test repo
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codebuff-eval-'))
  const repoDir = path.join(tempDir, 'repo')

  try {
    // Clone the repository. git is invoked with an argument array (no shell),
    // so characters such as `;`, `&&` or `$()` inside repoUrl are treated as
    // part of the URL instead of being executed as shell syntax.
    console.log(`Cloning repository ${repoUrl} to ${repoDir}...`)
    execFileSync('git', ['clone', repoUrl, repoDir], { stdio: 'ignore' })

    // Checkout the specific commit (again without going through a shell)
    console.log(`Checking out commit ${commitSha}...`)
    execFileSync('git', ['checkout', commitSha], {
      cwd: repoDir,
      stdio: 'ignore',
    })

    // Run initialization command if provided. This one is intentionally a
    // shell command line, so it still goes through execSync.
    if (initCommand) {
      console.log(`Running init command: ${initCommand}...`)
      execSync(initCommand, { cwd: repoDir, stdio: 'ignore' })
    }

    // Run the provided function with the repo directory
    return await fn(repoDir)
  } finally {
    // Clean up the temporary directory; a cleanup failure is only logged so
    // it never masks the result (or the error) of the steps above.
    console.log(`Cleaning up temporary directory ${tempDir}...`)
    try {
      fs.rmSync(tempDir, { recursive: true, force: true })
    } catch (error) {
      console.warn(`Failed to clean up temporary directory: ${error}`)
    }
  }
}
58+
59+
/**
 * Evaluates the implementation-planner agent on a single commit.
 *
 * The planner is run on `spec` inside a temporary checkout of `repoUrl` at
 * `commitSha` (see `withTestRepo`). Its output is then handed, together with
 * the before/after contents and unified diffs of `fileStates`, to the
 * `eval-judge` agent for grading.
 *
 * @param params.spec - Task description given to the planner.
 * @param params.repoUrl - Repository to clone for the planner run.
 * @param params.commitSha - Commit to check out before running the planner.
 * @param params.initCommand - Optional shell command run in the checkout first.
 * @param params.fileStates - Before/after contents of the files changed by the real commit.
 * @returns The judge's structured output (`judgingResults`) and the
 *   JSON-stringified planner output (`agentOutput`).
 * @throws If the judge run does not produce structured output.
 */
export const evalPlannerAgent = async (params: {
  spec: string
  repoUrl: string
  commitSha: string
  initCommand?: string
  fileStates: Array<{
    path: string
    preContent: string
    postContent: string
  }>
}) => {
  const { spec, repoUrl, commitSha, initCommand, fileStates } = params

  // The API key from the environment wins; locally stored credentials are
  // only consulted when the environment variable is empty or unset.
  const apiKey =
    process.env[API_KEY_ENV_VAR] || getUserCredentials()?.authToken
  const client = new CodebuffClient({ apiKey })

  // Step 1: run the planner with the temporary checkout as its cwd.
  const plannerRun = await withTestRepo(
    { repoUrl, commitSha, initCommand },
    async (cwd) => {
      console.log(
        `Running agent ${implementationPlannerAgent.id} with prompt: ${spec}...`,
      )
      const run = await client.run({
        agent: implementationPlannerAgent.id,
        prompt: `Please plan a full implementation of the following spec: ${spec}`,
        cwd,
        agentDefinitions: [implementationPlannerAgent],
        handleEvent: (event) => {
          console.log('Codebuff Event', JSON.stringify(event, null, 2))
        },
      })
      return run
    },
  )

  // Outputs without a `value` field are represented by their `message`.
  const plannerOutput = plannerRun.output
  const outputString = JSON.stringify(
    'value' in plannerOutput ? plannerOutput.value : plannerOutput.message,
  )

  // Step 2: render the real commit's changes, once as full before/after
  // contents and once as unified diffs.
  const fileChangeBlocks: string[] = []
  const diffBlocks: string[] = []
  for (const fileState of fileStates) {
    const filePath = fileState.path
    fileChangeBlocks.push(
      `\n### File: ${filePath}\n\n<pre_content>\n${fileState.preContent}\n</pre_content>\n\n<post_content>\n${fileState.postContent}\n</post_content>`,
    )

    const patch = createTwoFilesPatch(
      filePath,
      filePath,
      fileState.preContent,
      fileState.postContent,
      'before',
      'after',
    )
    diffBlocks.push(`\n### Diff for ${filePath}:\n\`\`\`diff\n${patch}\n\`\`\``)
  }
  const fileChangesSection = fileChangeBlocks.join('\n')
  const diffsSection = diffBlocks.join('\n')

  // Step 3: assemble the prompt shown to the judge.
  const judgePrompt = `# Implementation Plan Evaluation

## Task Specification

The agent was given the following spec to create an implementation plan:

<spec>
${spec}
</spec>

## Agent's Implementation Plan

<agent_output>
${outputString}
</agent_output>

## Expected Changes from Actual Commit

### File Changes
<expected_changes>${fileChangesSection}
</expected_changes>

### Expected Diffs
<expected_diffs>${diffsSection}
</expected_diffs>

## Your Task

Evaluate how well the implementation plan matches the real commit changes. Consider:
- Coverage of key changes from the commit
- Appropriateness and correctness of proposed code changes
- Whether following the plan would achieve the same (or better) behavior
- Any missing critical changes
- Any unnecessary proposed changes`

  // Step 4: grade the plan; anything but structured output is a failure.
  const judgeRun = await client.run({
    agent: 'eval-judge',
    prompt: judgePrompt,
    agentDefinitions: [judgeAgent],
  })
  const verdict = judgeRun.output
  if (verdict.type !== 'structuredOutput') {
    throw new Error('Error running judge agent')
  }

  return { judgingResults: verdict.value ?? {}, agentOutput: outputString }
}
173+
174+
/** System prompt that tells the judge what it receives and how to grade it. */
const judgeSystemPrompt = `You are an expert judge evaluating implementation plans created by AI agents.

## Context

You will receive:
1. A spec describing what changes should be made
2. An implementation plan created by an agent based on that spec
3. The actual file changes and diffs from a real git commit

## Your Role

Grade how well the implementation plan matches the actual implementation. The plan doesn't need to be identical - slight differences are acceptable if the behavior would be equivalent. Sometimes the plan might even propose improvements over the actual commit.

## Evaluation Criteria

- **Coverage**: Does the plan address all key changes from the commit?
- **Correctness**: Are the proposed code changes appropriate and accurate?
- **Behavioral equivalence**: Would following the plan achieve the same outcome?
- **Completeness**: Are any critical changes missing?
- **Efficiency**: Does it avoid unnecessary changes?`

/**
 * Definition of the `eval-judge` agent used to grade implementation plans.
 *
 * It takes a single string prompt, may only call `set_output`, and must
 * return a structured object with `reasoning`, `pros`, `cons` and a numeric
 * `overallScore` (described to the model as 0-100, higher is better).
 */
const judgeAgent: AgentDefinition = {
  id: 'eval-judge',
  displayName: 'Eval Judge',
  model: 'x-ai/grok-4-fast:free',
  toolNames: ['set_output'],
  inputSchema: {
    prompt: { type: 'string', description: 'The prompt to judge' },
  },
  outputMode: 'structured_output',
  outputSchema: {
    type: 'object',
    properties: {
      reasoning: { type: 'string' },
      pros: { type: 'string' },
      cons: { type: 'string' },
      overallScore: {
        type: 'number',
        description: 'A score between 0 and 100, where 100 is the best score',
      },
    },
    // Every field is mandatory so callers can destructure the result directly.
    required: ['reasoning', 'pros', 'cons', 'overallScore'],
  },
  systemPrompt: judgeSystemPrompt,
}
217+
218+
/** Contents of one file before and after an eval commit. */
type EvalFileState = {
  path: string
  preContent: string
  postContent: string
}

/** One commit to evaluate: its sha, the task spec, and the files it changed. */
type EvalCommit = {
  sha: string
  spec: string
  fileStates: Array<EvalFileState>
}

/** Shape of the eval JSON file: one repository plus the commits to evaluate in it. */
type EvalData = {
  repoUrl: string
  initCommand?: string
  evalCommits: Array<EvalCommit>
}
231+
232+
/**
 * Renders a ten-segment text bar for a score on a 0-100 scale.
 *
 * The score comes from a model, so it is not guaranteed to be in range.
 * The filled segment count is clamped to 0..10 because `String.prototype.repeat`
 * throws a RangeError for negative (or infinite) counts.
 */
const renderScoreBar = (score: number): string => {
  const totalSegments = 10
  const rawFilled = Math.floor(score / 10)
  const filled = Number.isNaN(rawFilled)
    ? 0
    : Math.min(totalSegments, Math.max(0, rawFilled))
  return '█'.repeat(filled) + '░'.repeat(totalSegments - filled)
}

/**
 * Command-line entry point: loads `../git-evals/eval-codebuff2.json` relative
 * to this file, runs the planner eval for its commits, and prints the judge's
 * reasoning, pros, cons and score for each one.
 *
 * A failure in one eval is logged and does not abort the run.
 */
async function main() {
  // Load the eval file
  const evalFilePath = path.join(
    __dirname,
    '..',
    'git-evals',
    'eval-codebuff2.json',
  )
  const evalData: EvalData = JSON.parse(fs.readFileSync(evalFilePath, 'utf-8'))

  const { repoUrl, initCommand, evalCommits } = evalData
  const divider = '='.repeat(80)

  // Loop through each eval task
  for (const evalCommit of evalCommits) {
    const { sha, spec, fileStates } = evalCommit

    console.log(`\n=== Running eval for commit ${sha} ===`)
    console.log(`Spec: ${spec.substring(0, 100)}...\n`)

    try {
      const result = await evalPlannerAgent({
        spec,
        repoUrl,
        commitSha: sha,
        initCommand,
        fileStates,
      })

      const { judgingResults } = result
      const { reasoning, pros, cons, overallScore } = judgingResults

      console.log(`\n${divider}`)
      console.log(`✓ Eval completed for commit ${sha}`)
      console.log(`${divider}\n`)

      console.log('📊 EVALUATION RESULTS')
      console.log('─'.repeat(80))

      if (reasoning) {
        console.log('\n🧠 REASONING:')
        console.log(reasoning)
      }

      if (pros) {
        console.log('\n✅ PROS:')
        console.log(pros)
      }

      if (cons) {
        console.log('\n❌ CONS:')
        console.log(cons)
      }

      if (typeof overallScore === 'number') {
        console.log('\n📈 OVERALL SCORE:')
        // The bar is clamped, but the raw score is still printed as reported.
        console.log(`${renderScoreBar(overallScore)} ${overallScore}/100`)
      }

      console.log('\n' + divider + '\n')
    } catch (error) {
      console.log(`\n${divider}`)
      console.error(`✗ Failed eval for commit ${sha}`)
      console.log(`${divider}\n`)
      console.error('Error details:', error)
      console.log('\n' + divider + '\n')
    }

    // Deliberately stops after the first commit for now; remove this break to
    // run every commit in the eval file.
    console.log('breaking for now')
    break
  }

  console.log('\n=== All evals completed ===')
}
307+
308+
// Only start the eval run when this module is the program entry point, not
// when it is imported for its exports.
if (import.meta.main) {
  // Any error escaping main() is reported and turned into a non-zero exit code.
  const reportFatal = (error: unknown) => {
    console.error('Fatal error:', error)
    process.exit(1)
  }
  main().catch(reportFatal)
}

0 commit comments

Comments
 (0)