Skip to content

Commit 1e419d4

Browse files
committed
BuffBench nightly evals
1 parent e7a210f commit 1e419d4

File tree

3 files changed

+96
-1
lines changed

3 files changed

+96
-1
lines changed

.github/workflows/nightly-evals.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ jobs:
4747
echo "CODEBUFF_GITHUB_TOKEN=${{ secrets.CODEBUFF_GITHUB_TOKEN }}" >> $GITHUB_ENV
4848
4949
- name: Run nightly evals
50-
run: cd evals && bun run-eval-set --concurrency 10 --email --title "Nightly Eval Run ($(date '+%Y-%m-%d'))"
50+
run: cd evals && bun run-buffbench-nightly
5151

5252
- name: Workflow completed
5353
run: echo "Nightly evals workflow completed successfully"

evals/buffbench/main-nightly.ts

Lines changed: 93 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,93 @@
1+
import path from 'path'
2+
3+
import { sendBasicEmail } from '@codebuff/internal/loops'
4+
5+
import { runBuffBench } from './run-buffbench'
6+
import type { AgentEvalResults } from './types'
7+
8+
/**
 * Runs the nightly BuffBench evaluation and emails a summary of the results.
 *
 * Steps:
 *  1. Run `runBuffBench` against `eval-codebuff.json` (resolved next to this
 *     file) for the configured agents.
 *  2. Format the results with `formatBuffBenchEmailContent`.
 *  3. Send the summary via `sendBasicEmail` to `EVAL_RESULTS_EMAIL`, falling
 *     back to `team@codebuff.com`.
 *
 * Email delivery is best-effort: a thrown error or an unsuccessful result is
 * logged but does not fail the run. A rejection from `runBuffBench` is not
 * caught here and propagates to the caller.
 *
 * Always terminates the process with exit code 0 once the email step finishes.
 */
async function main(): Promise<void> {
  // Single source of truth for the agents under test, so the banner below can
  // never drift from what is actually passed to runBuffBench.
  const agents = ['base', 'base2']

  console.log('Starting nightly buffbench evaluation...')
  console.log(`Agents: ${agents.join(', ')}`)
  console.log('Eval set: codebuff')
  console.log()

  const results = await runBuffBench({
    evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
    agents,
    taskConcurrency: 20,
  })

  console.log('\nNightly buffbench evaluation completed successfully!')

  // Send email with results.
  // `||` (not `??`) is deliberate: an empty EVAL_RESULTS_EMAIL should also
  // fall back to the default recipient.
  const recipientEmail = process.env.EVAL_RESULTS_EMAIL || 'team@codebuff.com'
  console.log(`\n📧 Sending buffbench results email to ${recipientEmail}...`)

  // `metadata` describes the run as a whole; every remaining key is treated
  // as a per-agent result.
  const { metadata, ...agentResults } = results
  const emailContent = formatBuffBenchEmailContent(agentResults, metadata)

  try {
    const emailResult = await sendBasicEmail({
      email: recipientEmail,
      data: emailContent,
      logger: console,
    })

    if (emailResult.success) {
      console.log('✅ BuffBench results email sent successfully!')
    } else {
      console.log('⚠️ Email sending was skipped (likely missing configuration)')
    }
  } catch (emailError) {
    // Best-effort: a failed notification must not turn a successful eval run
    // into a failed job.
    console.error('❌ Failed to send buffbench results email:', emailError)
  }

  process.exit(0)
}
47+
48+
/**
 * Builds the subject line and plain-text body of the nightly BuffBench
 * results email.
 *
 * @param results - Per-agent results keyed by agent id. For each entry this
 *   reads `averageScore`, `averageCost`, `averageDuration` (divided by 1000
 *   and shown with an `s` suffix) and `runs` (only its length is used).
 * @param metadata - Run-level metadata. Reads `logsDirectory`,
 *   `totalDuration` (divided by 1000 and then 60, shown as minutes),
 *   `commitsEvaluated`, `timestamp` and `repoUrl`. Missing fields are
 *   rendered as `N/A`.
 * @param now - Date stamped into the subject. Defaults to the current time;
 *   exposed so the output is deterministic under test.
 * @returns The email `subject` and `message`.
 */
function formatBuffBenchEmailContent(
  results: Record<string, AgentEvalResults>,
  metadata: any,
  now: Date = new Date(),
): { subject: string; message: string } {
  // Formats a number with a fixed number of decimals; anything that is not a
  // finite number (undefined, NaN from an empty average, ...) becomes "N/A"
  // instead of throwing or printing "NaN".
  const fixed = (value: unknown, digits: number): string =>
    typeof value === 'number' && Number.isFinite(value)
      ? value.toFixed(digits)
      : 'N/A'

  // Renders an arbitrary metadata field, substituting "N/A" for null/undefined.
  const shown = (value: unknown): string =>
    value == null ? 'N/A' : String(value)

  const meta = metadata ?? {}
  const entries = Object.entries(results)
  const agents = entries.map(([agentId]) => agentId)

  // ISO calendar date in UTC (YYYY-MM-DD): independent of the runner's locale
  // and time zone, unlike toLocaleDateString().
  const date = now.toISOString().slice(0, 10)

  const agentScores = entries
    .map(([agentId, result]) => `${agentId}: ${fixed(result.averageScore, 1)}`)
    .join(' | ')

  // Drop the score segment entirely when there are no agents, rather than
  // leaving a dangling " - " at the end of the subject.
  const subject = ['Nightly BuffBench Results', date, agentScores]
    .filter((part) => part !== '')
    .join(' - ')

  const agentComparison =
    entries.length === 0
      ? '(no agent results)'
      : entries
          .map(([agentId, result]) =>
            [
              `${agentId}:`,
              `- Average Score: ${fixed(result.averageScore, 2)}/10`,
              `- Average Cost: ${fixed(result.averageCost, 4)}`,
              `- Average Duration: ${fixed(result.averageDuration / 1000, 1)}s`,
              `- Valid Runs: ${result.runs?.length ?? 0}`,
            ].join('\n'),
          )
          .join('\n\n')

  const message = [
    '📊 NIGHTLY BUFFBENCH RESULTS',
    '',
    '📈 AGENT RESULTS:',
    agentComparison,
    '',
    `📁 Results Location: ${shown(meta.logsDirectory)}`,
    `⏱️ Total Evaluation Time: ${fixed(meta.totalDuration / 1000 / 60, 1)} minutes`,
    `• Total Tasks: ${shown(meta.commitsEvaluated)}`,
    `• Agents Tested: ${agents.join(', ') || 'none'}`,
    '',
    `Generated on: ${shown(meta.timestamp)}`,
    `Repository: ${shown(meta.repoUrl)}`,
  ].join('\n')

  return { subject, message }
}
87+
88+
// Kick off the nightly run only when `import.meta.main` is truthy
// (presumably: this file is the program entry point rather than an import —
// confirm against the runtime's docs).
if (import.meta.main) {
  // Last-resort handler: log whatever `main` rejected with and exit non-zero.
  const reportFatal = (reason: unknown): never => {
    console.error('Error running nightly buffbench:', reason)
    process.exit(1)
  }

  main().catch(reportFatal)
}

evals/package.json

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,8 @@
2121
"run-single-eval": "bun run git-evals/run-single-eval.ts --eval-file git-evals/eval-manifold.json --commit-sha ebabf7796a92ce8ece8e2452b0f3f896a513ba0e",
2222
"run-git-evals": "bun run git-evals/run-git-evals.ts",
2323
"run-eval-set": "bun run git-evals/run-eval-set.ts",
24+
"run-buffbench": "bun run buffbench/main.ts",
25+
"run-buffbench-nightly": "bun run buffbench/main-nightly.ts",
2426
"setup-codebuff-repo": "bun run setup-codebuff-repo.ts"
2527
},
2628
"sideEffects": false,

0 commit comments

Comments
 (0)