Skip to content

Commit 5dc640f

Browse files
committed
run eval set from a git worktree — so it freezes a copy of the code even when run locally
1 parent 41e0346 commit 5dc640f

File tree

2 files changed

+109
-18
lines changed

2 files changed

+109
-18
lines changed

evals/git-evals/run-eval-set.ts

Lines changed: 82 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#!/usr/bin/env bun
22

3+
import { execFileSync } from 'child_process'
34
import path from 'path'
45

6+
import { generateCompactId } from '@codebuff/common/util/string'
57
import { Command, Flags } from '@oclif/core'
68

79
import { sendEvalResultsEmail } from './email-eval-results'
@@ -92,6 +94,70 @@ class RunEvalSetCommand extends Command {
9294
}
9395
}
9496

97+
/**
 * Creates a git worktree checked out at the current HEAD commit and installs
 * dependencies in it, so an eval run executes against a frozen copy of the code.
 *
 * The worktree is placed in a sibling directory of the project root named
 * `codebuff-eval-worktree-<id>`, where `<id>` comes from `generateCompactId()`.
 *
 * @returns Absolute path of the newly created worktree.
 * @throws Rethrows the error raised by `git` or `bun install`. If the worktree
 *   had already been added when the failure occurred, it is removed (best
 *   effort) before rethrowing so a failed setup does not leave it behind.
 */
function createEvalWorktree(): string {
  // Get project root by going up from the evals/git-evals directory
  const projectRoot = path.resolve(__dirname, '../..')

  // Run git from the project root rather than the process cwd: the worktree
  // location below is derived from this file's repository, so the commit and
  // the worktree must come from that same repository.
  const currentCommit = execFileSync('git', ['rev-parse', 'HEAD'], {
    cwd: projectRoot,
    encoding: 'utf-8',
  }).trim()

  const worktreeId = generateCompactId()
  const worktreePath = path.resolve(
    projectRoot,
    '..',
    `codebuff-eval-worktree-${worktreeId}`,
  )

  console.log(`Creating eval worktree at ${worktreePath}...`)
  console.log(`Commit: ${currentCommit}`)

  // Tracks whether `git worktree add` succeeded, so the failure path only
  // tries to remove a worktree that actually exists.
  let worktreeAdded = false

  try {
    execFileSync('git', ['worktree', 'add', worktreePath, currentCommit], {
      cwd: projectRoot,
      stdio: 'inherit',
    })
    worktreeAdded = true
    console.log('✅ Worktree created successfully')

    // Install dependencies in worktree to ensure node_modules are in sync
    console.log('Installing dependencies in worktree...')
    execFileSync('bun', ['install'], {
      cwd: worktreePath,
      stdio: 'inherit',
    })
    console.log('✅ Dependencies installed successfully')

    return worktreePath
  } catch (error) {
    console.error('Failed to create worktree:', error)

    // The caller never receives the path when setup fails, so nobody else can
    // clean up: remove the half-initialized worktree here.
    if (worktreeAdded) {
      try {
        execFileSync('git', ['worktree', 'remove', worktreePath, '--force'], {
          cwd: projectRoot,
          stdio: 'inherit',
        })
      } catch (cleanupError) {
        console.error(
          'Failed to remove partially set up worktree:',
          cleanupError,
        )
      }
    }

    throw error
  }
}
137+
138+
/**
 * Tears down the worktree that was created for an eval run.
 *
 * First asks git to force-remove the worktree. If that fails, the error is
 * logged and `git worktree prune` is attempted instead. Failures are only
 * logged; this function does not throw them to the caller.
 *
 * @param worktreePath Path of the worktree to remove.
 */
function cleanupEvalWorktree(worktreePath: string): void {
  console.log(`\nCleaning up eval worktree at ${worktreePath}...`)

  // Small local runner so both git invocations share the same options.
  const runGit = (gitArgs: string[]): void => {
    execFileSync('git', gitArgs, { stdio: 'inherit' })
  }

  try {
    // Remove the worktree
    runGit(['worktree', 'remove', worktreePath, '--force'])
    console.log('✅ Worktree removed successfully')
    return
  } catch (removeError) {
    console.error('Failed to remove worktree:', removeError)
  }

  // Try to prune if remove failed
  try {
    runGit(['worktree', 'prune'])
  } catch (pruneError) {
    console.error('Failed to prune worktrees:', pruneError)
  }
}
160+
95161
async function runEvalSet(options: {
96162
sets: string
97163
'output-dir': string
@@ -124,10 +190,14 @@ async function runEvalSet(options: {
124190
console.log('Starting eval set run...')
125191
console.log(`Output directory: ${outputDir}`)
126192

127-
// Set up signal handlers to clean up child processes
193+
// Create worktree to freeze code version for this eval run
194+
const worktreePath = createEvalWorktree()
195+
196+
// Set up signal handlers to clean up child processes and worktree
128197
const signalHandler = async (signal: string) => {
129198
console.log(`\nReceived ${signal}, cleaning up evaluation processes...`)
130199
await terminateAllEvalChildren()
200+
cleanupEvalWorktree(worktreePath)
131201
console.log('Cleanup complete.')
132202
process.exit(signal === 'SIGINT' ? 130 : 143)
133203
}
@@ -151,25 +221,28 @@ async function runEvalSet(options: {
151221
)
152222
}
153223

224+
// Resolve eval data paths inside the worktree so they are read from the frozen copy
225+
const baseDir = path.join(worktreePath, 'evals', 'git-evals')
226+
154227
const allEvalConfigs: EvalConfig[] = [
155228
{
156229
name: 'codebuff',
157-
evalDataPath: path.join(__dirname, 'eval-codebuff2.json'),
230+
evalDataPath: path.join(baseDir, 'eval-codebuff2.json'),
158231
outputDir,
159232
},
160233
{
161234
name: 'manifold',
162-
evalDataPath: path.join(__dirname, 'eval-manifold2.json'),
235+
evalDataPath: path.join(baseDir, 'eval-manifold2.json'),
163236
outputDir,
164237
},
165238
{
166239
name: 'plane',
167-
evalDataPath: path.join(__dirname, 'eval-plane.json'),
240+
evalDataPath: path.join(baseDir, 'eval-plane.json'),
168241
outputDir,
169242
},
170243
{
171244
name: 'saleor',
172-
evalDataPath: path.join(__dirname, 'eval-saleor.json'),
245+
evalDataPath: path.join(baseDir, 'eval-saleor.json'),
173246
outputDir,
174247
},
175248
]
@@ -204,6 +277,7 @@ async function runEvalSet(options: {
204277
config.limit,
205278
options.concurrency === 1,
206279
agent,
280+
worktreePath,
207281
)
208282
} catch (error) {
209283
const evalDuration = Date.now() - evalStartTime
@@ -447,6 +521,9 @@ async function runEvalSet(options: {
447521
}
448522
}
449523

524+
// Clean up worktree before exiting
525+
cleanupEvalWorktree(worktreePath)
526+
450527
if (failureCount > 0) {
451528
console.log(
452529
'\n⚠️ Some evaluations failed. Check the logs above for details.',

evals/git-evals/run-git-evals.ts

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -149,17 +149,17 @@ You must decide whether to:
149149
150150
If deciding to continue, include a clear, focused prompt for Codebuff in next_prompt. Note that Codebuff does not have access to the spec, so you must describe the changes you want Codebuff to make in a way that is clear and concise.
151151
Explain your reasoning in detail. Do not ask Codebuff to git commit changes.`,
152-
},
153-
],
154-
schema: AgentDecisionSchema,
155-
model: 'x-ai/grok-4-fast',
156-
clientSessionId,
157-
fingerprintId,
158-
userInputId: generateCompactId(),
159-
userId: undefined,
160-
timeout: 5 * 60_000, // 5 minute timeout
161-
logger: console,
162-
})
152+
},
153+
],
154+
schema: AgentDecisionSchema,
155+
model: 'x-ai/grok-4-fast',
156+
clientSessionId,
157+
fingerprintId,
158+
userInputId: generateCompactId(),
159+
userId: undefined,
160+
timeout: 5 * 60_000, // 5 minute timeout
161+
logger: console,
162+
})
163163
} catch (agentError) {
164164
throw new Error(
165165
`Agent decision failed: ${agentError instanceof Error ? `${agentError.message}\n${JSON.stringify(agentError)}\n${agentError.stack}` : String(agentError)}`,
@@ -376,6 +376,7 @@ export async function runGitEvals(
376376
limit?: number,
377377
logToStdout: boolean = false,
378378
agent: string = 'base',
379+
worktreePath?: string,
379380
): Promise<FullEvalLog> {
380381
// Set up signal handlers if this is the main module
381382
if (require.main === module) {
@@ -469,14 +470,25 @@ export async function runGitEvals(
469470
: fs.createWriteStream(logPath)
470471

471472
// Write evalCommit to temporary file to avoid long command line arguments
472-
const tempEvalCommitPath = path.join(
473+
// Use absolute path so it works from worktree too
474+
const tempEvalCommitPath = path.resolve(
473475
logsDir,
474476
`eval-commit-${evalCommit.sha.slice(0, 7)}.json`,
475477
)
476478
fs.writeFileSync(tempEvalCommitPath, JSON.stringify(evalCommit))
477479

480+
// Resolve the process script path relative to worktree if provided
481+
const processScriptPath = worktreePath
482+
? path.join(
483+
worktreePath,
484+
'evals',
485+
'git-evals',
486+
'run-single-eval-process.ts',
487+
)
488+
: path.resolve(__dirname, 'run-single-eval-process.ts')
489+
478490
const child = fork(
479-
path.resolve(__dirname, 'run-single-eval-process.ts'),
491+
processScriptPath,
480492
[
481493
tempEvalCommitPath,
482494
projectPath,
@@ -489,6 +501,8 @@ export async function runGitEvals(
489501
stdio: ['pipe', 'pipe', 'pipe', 'ipc'],
490502
env: process.env,
491503
detached: true, // Create new process group for proper signal handling
504+
// Set cwd to worktree so relative imports work correctly
505+
...(worktreePath ? { cwd: worktreePath } : {}),
492506
},
493507
)
494508

0 commit comments

Comments
 (0)