|
import { execFileSync, execSync } from 'child_process'
import * as fs from 'fs'
import * as os from 'os'
import * as path from 'path'

import { createTwoFilesPatch } from 'diff'

import { API_KEY_ENV_VAR } from '@codebuff/common/old-constants'
import { getUserCredentials } from '@codebuff/npm-app/credentials'

import implementationPlannerAgent from '../../.agents/implementation-planner/implementation-planner'
import { AgentDefinition } from '../../sdk/src'
import { CodebuffClient } from '../../sdk/src/client'
| 12 | + |
| 13 | +/** |
| 14 | + * Helper function to manage test repository lifecycle |
| 15 | + * Sets up a test repo, runs a function with the repo cwd, then cleans up |
| 16 | + */ |
| 17 | +export const withTestRepo = async <T>( |
| 18 | + repoConfig: { |
| 19 | + repoUrl: string |
| 20 | + commitSha: string |
| 21 | + initCommand?: string |
| 22 | + }, |
| 23 | + fn: (cwd: string) => Promise<T>, |
| 24 | +): Promise<T> => { |
| 25 | + const { repoUrl, commitSha, initCommand } = repoConfig |
| 26 | + |
| 27 | + // Create a temporary directory for the test repo |
| 28 | + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codebuff-eval-')) |
| 29 | + const repoDir = path.join(tempDir, 'repo') |
| 30 | + |
| 31 | + try { |
| 32 | + // Clone the repository |
| 33 | + console.log(`Cloning repository ${repoUrl} to ${repoDir}...`) |
| 34 | + execSync(`git clone ${repoUrl} ${repoDir}`, { stdio: 'ignore' }) |
| 35 | + |
| 36 | + // Checkout the specific commit |
| 37 | + console.log(`Checking out commit ${commitSha}...`) |
| 38 | + execSync(`git checkout ${commitSha}`, { cwd: repoDir, stdio: 'ignore' }) |
| 39 | + |
| 40 | + // Run initialization command if provided |
| 41 | + if (initCommand) { |
| 42 | + console.log(`Running init command: ${initCommand}...`) |
| 43 | + execSync(initCommand, { cwd: repoDir, stdio: 'ignore' }) |
| 44 | + } |
| 45 | + |
| 46 | + // Run the provided function with the repo directory |
| 47 | + return await fn(repoDir) |
| 48 | + } finally { |
| 49 | + // Clean up the temporary directory |
| 50 | + console.log(`Cleaning up temporary directory ${tempDir}...`) |
| 51 | + try { |
| 52 | + fs.rmSync(tempDir, { recursive: true, force: true }) |
| 53 | + } catch (error) { |
| 54 | + console.warn(`Failed to clean up temporary directory: ${error}`) |
| 55 | + } |
| 56 | + } |
| 57 | +} |
| 58 | + |
| 59 | +export const evalPlannerAgent = async (params: { |
| 60 | + spec: string |
| 61 | + repoUrl: string |
| 62 | + commitSha: string |
| 63 | + initCommand?: string |
| 64 | + fileStates: Array<{ |
| 65 | + path: string |
| 66 | + preContent: string |
| 67 | + postContent: string |
| 68 | + }> |
| 69 | +}) => { |
| 70 | + const { spec, repoUrl, commitSha, initCommand, fileStates } = params |
| 71 | + const getLocalAuthToken = () => { |
| 72 | + return getUserCredentials()?.authToken |
| 73 | + } |
| 74 | + const client = new CodebuffClient({ |
| 75 | + apiKey: process.env[API_KEY_ENV_VAR] || getLocalAuthToken(), |
| 76 | + }) |
| 77 | + |
| 78 | + const result = await withTestRepo( |
| 79 | + { repoUrl, commitSha, initCommand }, |
| 80 | + async (cwd) => { |
| 81 | + // Run the agent with the test repository as cwd |
| 82 | + console.log( |
| 83 | + `Running agent ${implementationPlannerAgent.id} with prompt: ${spec}...`, |
| 84 | + ) |
| 85 | + return await client.run({ |
| 86 | + agent: implementationPlannerAgent.id, |
| 87 | + prompt: `Please plan a full implementation of the following spec: ${spec}`, |
| 88 | + cwd, |
| 89 | + agentDefinitions: [implementationPlannerAgent], |
| 90 | + handleEvent: (event) => { |
| 91 | + console.log('Codebuff Event', JSON.stringify(event, null, 2)) |
| 92 | + }, |
| 93 | + }) |
| 94 | + }, |
| 95 | + ) |
| 96 | + |
| 97 | + const { output } = result |
| 98 | + |
| 99 | + const outputString = JSON.stringify( |
| 100 | + 'value' in output ? output.value : output.message, |
| 101 | + ) |
| 102 | + |
| 103 | + // Compute file changes and diffs |
| 104 | + const fileChangesSection = fileStates |
| 105 | + .map(({ path, preContent, postContent }) => { |
| 106 | + return `\n### File: ${path}\n\n<pre_content>\n${preContent}\n</pre_content>\n\n<post_content>\n${postContent}\n</post_content>` |
| 107 | + }) |
| 108 | + .join('\n') |
| 109 | + |
| 110 | + const diffsSection = fileStates |
| 111 | + .map(({ path, preContent, postContent }) => { |
| 112 | + const diff = createTwoFilesPatch( |
| 113 | + path, |
| 114 | + path, |
| 115 | + preContent, |
| 116 | + postContent, |
| 117 | + 'before', |
| 118 | + 'after', |
| 119 | + ) |
| 120 | + return `\n### Diff for ${path}:\n\`\`\`diff\n${diff}\n\`\`\`` |
| 121 | + }) |
| 122 | + .join('\n') |
| 123 | + |
| 124 | + // Build the judge prompt |
| 125 | + const judgePrompt = `# Implementation Plan Evaluation |
| 126 | +
|
| 127 | +## Task Specification |
| 128 | +
|
| 129 | +The agent was given the following spec to create an implementation plan: |
| 130 | +
|
| 131 | +<spec> |
| 132 | +${spec} |
| 133 | +</spec> |
| 134 | +
|
| 135 | +## Agent's Implementation Plan |
| 136 | +
|
| 137 | +<agent_output> |
| 138 | +${outputString} |
| 139 | +</agent_output> |
| 140 | +
|
| 141 | +## Expected Changes from Actual Commit |
| 142 | +
|
| 143 | +### File Changes |
| 144 | +<expected_changes>${fileChangesSection} |
| 145 | +</expected_changes> |
| 146 | +
|
| 147 | +### Expected Diffs |
| 148 | +<expected_diffs>${diffsSection} |
| 149 | +</expected_diffs> |
| 150 | +
|
| 151 | +## Your Task |
| 152 | +
|
| 153 | +Evaluate how well the implementation plan matches the real commit changes. Consider: |
| 154 | +- Coverage of key changes from the commit |
| 155 | +- Appropriateness and correctness of proposed code changes |
| 156 | +- Whether following the plan would achieve the same (or better) behavior |
| 157 | +- Any missing critical changes |
| 158 | +- Any unnecessary proposed changes` |
| 159 | + |
| 160 | + const judgeResult = await client.run({ |
| 161 | + agent: 'eval-judge', |
| 162 | + prompt: judgePrompt, |
| 163 | + agentDefinitions: [judgeAgent], |
| 164 | + }) |
| 165 | + if (judgeResult.output.type !== 'structuredOutput') { |
| 166 | + throw new Error('Error running judge agent') |
| 167 | + } |
| 168 | + const { output: judgeOutput } = judgeResult |
| 169 | + const judgingResults = judgeOutput.value ?? {} |
| 170 | + |
| 171 | + return { judgingResults, agentOutput: outputString } |
| 172 | +} |
| 173 | + |
/**
 * Definition of the "eval-judge" agent: grades an implementation plan
 * against the real commit's changes and must emit a structured verdict
 * (reasoning, pros, cons, 0-100 overallScore) via the set_output tool.
 */
const judgeAgent: AgentDefinition = {
  id: 'eval-judge',
  displayName: 'Eval Judge',
  // NOTE(review): free-tier model hardcoded here — confirm it is still
  // available and appropriate for judging runs.
  model: 'x-ai/grok-4-fast:free',
  // set_output is the only tool: the judge can only report its verdict.
  toolNames: ['set_output'],
  inputSchema: {
    prompt: { type: 'string', description: 'The prompt to judge' },
  },
  outputMode: 'structured_output',
  // JSON schema of the verdict; all four fields are required.
  outputSchema: {
    type: 'object',
    properties: {
      reasoning: { type: 'string' },
      pros: { type: 'string' },
      cons: { type: 'string' },
      overallScore: {
        type: 'number',
        description: 'A score between 0 and 100, where 100 is the best score',
      },
    },
    required: ['reasoning', 'pros', 'cons', 'overallScore'],
  },
  systemPrompt: `You are an expert judge evaluating implementation plans created by AI agents.

## Context

You will receive:
1. A spec describing what changes should be made
2. An implementation plan created by an agent based on that spec
3. The actual file changes and diffs from a real git commit

## Your Role

Grade how well the implementation plan matches the actual implementation. The plan doesn't need to be identical - slight differences are acceptable if the behavior would be equivalent. Sometimes the plan might even propose improvements over the actual commit.

## Evaluation Criteria

- **Coverage**: Does the plan address all key changes from the commit?
- **Correctness**: Are the proposed code changes appropriate and accurate?
- **Behavioral equivalence**: Would following the plan achieve the same outcome?
- **Completeness**: Are any critical changes missing?
- **Efficiency**: Does it avoid unnecessary changes?`,
}
| 217 | + |
/**
 * Shape of the eval fixture JSON file: one repository plus the list of
 * commits to evaluate, each with its spec and the pre/post contents of
 * every file the commit changed.
 */
type EvalData = {
  repoUrl: string
  // Optional shell command run after cloning (e.g. dependency install).
  initCommand?: string
  evalCommits: Array<{
    sha: string
    spec: string
    fileStates: Array<{
      path: string
      preContent: string
      postContent: string
    }>
  }>
}
| 231 | + |
| 232 | +async function main() { |
| 233 | + // Load the eval file |
| 234 | + const evalFilePath = path.join( |
| 235 | + __dirname, |
| 236 | + '..', |
| 237 | + 'git-evals', |
| 238 | + 'eval-codebuff2.json', |
| 239 | + ) |
| 240 | + const evalData: EvalData = JSON.parse(fs.readFileSync(evalFilePath, 'utf-8')) |
| 241 | + |
| 242 | + const { repoUrl, initCommand, evalCommits } = evalData |
| 243 | + |
| 244 | + // Loop through each eval task |
| 245 | + for (const evalCommit of evalCommits) { |
| 246 | + const { sha, spec, fileStates } = evalCommit |
| 247 | + |
| 248 | + console.log(`\n=== Running eval for commit ${sha} ===`) |
| 249 | + console.log(`Spec: ${spec.substring(0, 100)}...\n`) |
| 250 | + |
| 251 | + try { |
| 252 | + const result = await evalPlannerAgent({ |
| 253 | + spec, |
| 254 | + repoUrl, |
| 255 | + commitSha: sha, |
| 256 | + initCommand, |
| 257 | + fileStates, |
| 258 | + }) |
| 259 | + |
| 260 | + const { judgingResults } = result |
| 261 | + const { reasoning, pros, cons, overallScore } = judgingResults |
| 262 | + |
| 263 | + console.log(`\n${'='.repeat(80)}`) |
| 264 | + console.log(`✓ Eval completed for commit ${sha}`) |
| 265 | + console.log(`${'='.repeat(80)}\n`) |
| 266 | + |
| 267 | + console.log('📊 EVALUATION RESULTS') |
| 268 | + console.log('─'.repeat(80)) |
| 269 | + |
| 270 | + if (reasoning) { |
| 271 | + console.log('\n🧠 REASONING:') |
| 272 | + console.log(reasoning) |
| 273 | + } |
| 274 | + |
| 275 | + if (pros) { |
| 276 | + console.log('\n✅ PROS:') |
| 277 | + console.log(pros) |
| 278 | + } |
| 279 | + |
| 280 | + if (cons) { |
| 281 | + console.log('\n❌ CONS:') |
| 282 | + console.log(cons) |
| 283 | + } |
| 284 | + |
| 285 | + if (typeof overallScore === 'number') { |
| 286 | + console.log('\n📈 OVERALL SCORE:') |
| 287 | + const scoreBar = '█'.repeat(Math.floor(overallScore / 10)) |
| 288 | + const emptyBar = '░'.repeat(10 - Math.floor(overallScore / 10)) |
| 289 | + console.log(`${scoreBar}${emptyBar} ${overallScore}/100`) |
| 290 | + } |
| 291 | + |
| 292 | + console.log('\n' + '='.repeat(80) + '\n') |
| 293 | + } catch (error) { |
| 294 | + console.log(`\n${'='.repeat(80)}`) |
| 295 | + console.error(`✗ Failed eval for commit ${sha}`) |
| 296 | + console.log(`${'='.repeat(80)}\n`) |
| 297 | + console.error('Error details:', error) |
| 298 | + console.log('\n' + '='.repeat(80) + '\n') |
| 299 | + } |
| 300 | + |
| 301 | + console.log('breaking for now') |
| 302 | + break |
| 303 | + } |
| 304 | + |
| 305 | + console.log('\n=== All evals completed ===') |
| 306 | +} |
| 307 | + |
// Run main if this file is executed directly.
// NOTE(review): `import.meta.main` is a Bun extension (only very recent
// Node versions support it) — confirm this script is run with a compatible
// runtime, otherwise main() never executes.
if (import.meta.main) {
  main().catch((error) => {
    // Surface unexpected failures and exit non-zero so CI notices.
    console.error('Fatal error:', error)
    process.exit(1)
  })
}
0 commit comments