Skip to content

Commit ad9ad64

Browse files
committed
Better log judge errors
1 parent 9ee6a42 commit ad9ad64

File tree

2 files changed

+32
-16
lines changed

2 files changed

+32
-16
lines changed

evals/buffbench/judge.ts

Lines changed: 31 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,12 @@
11
import { z } from 'zod/v4'
22

3-
import type { FileDiff } from './types'
3+
import type { EvalCommitV2 } from './types'
44
import type { AgentDefinition, CodebuffClient } from '@codebuff/sdk'
55
import { withTimeout } from '@codebuff/common/util/promise'
6+
import path from 'path'
7+
import fs from 'fs'
8+
9+
const DEBUG_ERROR = true
610

711
export const JudgingResultSchema = z.object({
812
analysis: z
@@ -135,8 +139,7 @@ const judgeAgents: Record<string, AgentDefinition> = {
135139

136140
interface JudgeCommitResultInput {
137141
client: CodebuffClient
138-
prompt: string
139-
groundTruthFileDiffs: FileDiff[]
142+
commit: EvalCommitV2
140143
contextFiles: Record<string, string>
141144
agentDiff: string
142145
error?: string
@@ -177,7 +180,26 @@ async function runSingleJudge(
177180
`Judge ${judgeAgentId} - not structured output`,
178181
JSON.stringify(judgeResult.output, null, 2),
179182
)
180-
console.error('Judge agent output trace:', agentOutput.join(''))
183+
console.error(
184+
'Judge agent output:',
185+
JSON.stringify(judgeResult.output, null, 2),
186+
'Judge agent output trace:',
187+
agentOutput.join(''),
188+
)
189+
if (DEBUG_ERROR) {
190+
fs.writeFileSync(
191+
path.join(
192+
__dirname,
193+
'..',
194+
`${input.commit.id}-${judgeAgentId}-agent-output-error.json`,
195+
),
196+
JSON.stringify(
197+
{ output: judgeResult.output, trace: agentOutput },
198+
null,
199+
2,
200+
),
201+
)
202+
}
181203
return null
182204
}
183205

@@ -191,16 +213,11 @@ async function runSingleJudge(
191213
export async function judgeCommitResult(
192214
input: JudgeCommitResultInput,
193215
): Promise<JudgingResult> {
194-
const {
195-
prompt,
196-
groundTruthFileDiffs,
197-
contextFiles,
198-
agentDiff,
199-
error,
200-
finalCheckOutputs,
201-
} = input
202-
203-
const groundTruthDiffs = groundTruthFileDiffs
216+
const { commit, contextFiles, agentDiff, error, finalCheckOutputs } = input
217+
218+
const { prompt, fileDiffs } = commit
219+
220+
const groundTruthDiffs = fileDiffs
204221
.map(({ path, diff }) => {
205222
return `### ${path}\n\`\`\`diff\n${diff}\n\`\`\``
206223
})

evals/buffbench/run-buffbench.ts

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -98,8 +98,7 @@ async function runTask(options: {
9898

9999
const judgeResult = await judgeCommitResult({
100100
client,
101-
prompt: commit.prompt,
102-
groundTruthFileDiffs: commit.fileDiffs,
101+
commit,
103102
contextFiles: agentResult.contextFiles,
104103
agentDiff: agentResult.diff,
105104
error: agentResult.error,

0 commit comments

Comments
 (0)