Judge based on prompt not spec, pass in context files

jahooma · jahooma · commit 4918fe525371 · 2025-10-11T21:11:02.000-07:00
diff --git a/evals/git-evals2/agent-runner.ts b/evals/git-evals2/agent-runner.ts
@@ -5,7 +5,7 @@ import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
 import { CodebuffClient } from '../../sdk/src/client'
 import { withTestRepo } from '../subagents/test-repo-utils'
 
-import type { EvalCommit } from './types'
+import type { EvalCommitV2 } from './types'
 
 export interface AgentStep {
   response: string
@@ -15,6 +15,7 @@ export interface AgentStep {
 
 export interface AgentRunResult {
   diff: string
+  contextFiles: Record<string, string>
   durationMs: number
   cost: number
   error?: string
@@ -30,12 +31,14 @@ export async function runAgentOnCommit({
 }: {
   client: CodebuffClient
   agentId: string
-  commit: EvalCommit
+  commit: EvalCommitV2
   repoUrl: string
   initCommand?: string
 }): Promise<AgentRunResult> {
+  console.log(`[${commit.id}] Running agent ${agentId}...`)
   const startTime = Date.now()
   let diff = ''
+  let contextFiles: Record<string, string> = {}
   let error: string | undefined
   let cost = 0
   const trace: AgentStep[] = []
@@ -56,9 +59,13 @@ export async function runAgentOnCommit({
         let responseText = ''
         let toolCalls: any[] = []
         let toolResults: any[] = []
-        
+
         function flushStep() {
-          if (responseText.length > 0 || toolCalls.length > 0 || toolResults.length > 0) {
+          if (
+            responseText.length > 0 ||
+            toolCalls.length > 0 ||
+            toolResults.length > 0
+          ) {
             trace.push({ response: responseText, toolCalls, toolResults })
             responseText = ''
             toolCalls = []
@@ -68,7 +75,7 @@ export async function runAgentOnCommit({
 
         const result = await client.run({
           agent: agentId,
-          prompt: commit.spec,
+          prompt: commit.prompt,
           agentDefinitions: localAgentDefinitions,
           cwd: repoDir,
           handleEvent: (event) => {
@@ -98,6 +105,27 @@ export async function runAgentOnCommit({
           cwd: repoDir,
           encoding: 'utf-8',
         })
+
+        const contextFilePaths = new Set<string>([
+          ...commit.supplementalFiles,
+          ...commit.fileDiffs.map((fd) => fd.path),
+        ])
+
+        for (const filePath of contextFilePaths) {
+          // Escape `"`, `\`, `$` and backticks: sh expands them in double quotes.
+          const ref = `${commit.parentSha}:${filePath}`
+          const escapedRef = ref.replace(/["\\$`]/g, '\\$&')
+          try {
+            contextFiles[filePath] = execSync(`git show "${escapedRef}"`, {
+              cwd: repoDir,
+              encoding: 'utf-8',
+              maxBuffer: 10 * 1024 * 1024,
+            })
+          } catch {
+            // Best-effort: an unreadable path is recorded as empty content.
+            contextFiles[filePath] = ''
+          }
+        }
       },
     )
   } catch (e) {
@@ -108,6 +136,7 @@ export async function runAgentOnCommit({
 
   return {
     diff,
+    contextFiles,
     durationMs,
     cost,
     error,
diff --git a/evals/git-evals2/judge.ts b/evals/git-evals2/judge.ts
@@ -1,23 +1,22 @@
-import { createTwoFilesPatch } from 'diff'
 import { z } from 'zod/v4'
 
-import type { FileState } from './types'
+import type { FileDiff } from './types'
 import type { AgentDefinition } from '../../sdk/src'
 import type { CodebuffClient } from '../../sdk/src/client'
 
 export const JudgingResultSchema = z.object({
   analysis: z
     .string()
     .describe('Detailed analysis comparing agent changes to ground truth'),
-  strengths: z.array(z.string()).describe('Key strengths of the implementation'),
-  weaknesses: z
+  strengths: z
     .array(z.string())
-    .describe('Key weaknesses or issues found'),
+    .describe('Key strengths of the implementation'),
+  weaknesses: z.array(z.string()).describe('Key weaknesses or issues found'),
   completionScore: z
     .number()
     .min(0)
     .max(10)
-    .describe('How completely the spec was implemented'),
+    .describe('How completely the prompt was addressed'),
   codeQualityScore: z
     .number()
     .min(0)
@@ -42,7 +41,8 @@ const judgeAgent: AgentDefinition = {
     properties: {
       analysis: {
         type: 'string',
-        description: 'Detailed analysis comparing agent changes to ground truth',
+        description:
+          'Detailed analysis comparing agent changes to ground truth',
       },
       strengths: {
         type: 'array',
@@ -58,7 +58,7 @@ const judgeAgent: AgentDefinition = {
         type: 'number',
         minimum: 0,
         maximum: 10,
-        description: 'How completely the spec was implemented',
+        description: 'How completely the prompt was addressed',
       },
       codeQualityScore: {
         type: 'number',
@@ -82,60 +82,84 @@ const judgeAgent: AgentDefinition = {
       'overallScore',
     ],
   },
-  systemPrompt: `You are an expert software engineer evaluating AI-generated code changes.
+  systemPrompt: `You are an expert software engineer evaluating AI-generated code changes with empathy for the task given.
 
 ## Your Role
 
 You will receive:
-1. A spec describing what changes should be made
-2. The ground truth changes (expected)
-3. The agent's actual changes
+1. The user prompt that the coding agent was given
+2. Context files from the codebase
+3. The ground truth changes (expected outcome)
+4. The agent's actual changes
+
+## Evaluation Philosophy
+
+**Judge based on what the agent was asked to do, not on perfection.**
+
+- If the prompt is vague or high-level (e.g., "add authentication"), be lenient and accept any reasonable implementation that achieves the goal
+- If the prompt is specific and detailed, expect the implementation to match those details more closely
+- Focus on whether the agent understood and addressed the user's intent
+- Consider that there are often multiple valid ways to implement the same feature
 
 ## Evaluation Criteria
 
-- **Completion** (0-10): How completely was the spec implemented?
+- **Completion** (0-10): How well did the agent address what was asked in the prompt? Consider the specificity of the prompt.
 - **Code Quality** (0-10): How well-structured and maintainable is the code?
-- **Overall** (0-10): Combined quality assessment
+- **Overall** (0-10): Combined assessment of whether the agent successfully completed the task as requested
+
+## Ground Truth
 
-Focus on behavioral equivalence - the implementation doesn't need to be identical to ground truth, but should achieve the same outcome. Valid alternative approaches are acceptable.
+The ground truth shows ONE valid implementation, but it's not the only correct answer. The agent's implementation should be judged on:
+- Does it achieve the same functional outcome?
+- Is it a reasonable approach given the prompt?
+- Does it maintain code quality?
 
 Provide detailed analysis, strengths, weaknesses, and numerical scores.`,
 }
 
 interface JudgeCommitResultInput {
   client: CodebuffClient
-  spec: string
-  groundTruthFileStates: FileState[]
+  prompt: string
+  groundTruthFileDiffs: FileDiff[]
+  contextFiles: Record<string, string>
   agentDiff: string
   error?: string
 }
 
 export async function judgeCommitResult(
   input: JudgeCommitResultInput,
 ): Promise<JudgingResult> {
-  const { client, spec, groundTruthFileStates, agentDiff, error } = input
-
-  const groundTruthDiffs = groundTruthFileStates
-    .map(({ path, preContent, postContent }) => {
-      const diff = createTwoFilesPatch(
-        path,
-        path,
-        preContent,
-        postContent,
-        'before',
-        'after',
-      )
+  const {
+    client,
+    prompt,
+    groundTruthFileDiffs,
+    contextFiles,
+    agentDiff,
+    error,
+  } = input
+
+  const groundTruthDiffs = groundTruthFileDiffs
+    .map(({ path, diff }) => {
       return `### ${path}\n\`\`\`diff\n${diff}\n\`\`\``
     })
     .join('\n\n')
 
-  const judgePrompt = `## Task Specification
-${spec}
+  const contextFilesContent = Object.entries(contextFiles)
+    .map(([filePath, content]) => {
+      return `### ${filePath}\n\`\`\`\n${content}\n\`\`\``
+    })
+    .join('\n\n')
+
+  const judgePrompt = `## User Prompt (What the agent was asked to do)
+${prompt}
+
+## Context Files (from parent commit)
+${contextFilesContent || '(No context files)'}
 
-## Ground Truth Changes (Expected)
+## Ground Truth Changes (One valid implementation)
 ${groundTruthDiffs}
 
-## Agent's Changes (Actual)
+## Agent's Changes (What the agent actually did)
 \`\`\`diff
 ${agentDiff || '(No changes made)'}
 \`\`\`
diff --git a/evals/git-evals2/run-git-evals2.ts b/evals/git-evals2/run-git-evals2.ts
@@ -1,3 +1,4 @@
+import { execSync } from 'child_process'
 import fs from 'fs'
 import path from 'path'
 
@@ -8,7 +9,7 @@ import { CodebuffClient } from '../../sdk/src/client'
 import { runAgentOnCommit } from './agent-runner'
 import { judgeCommitResult } from './judge'
 import { analyzeAgentTraces, type AgentTraceData } from './trace-analyzer'
-import { AgentEvalResults, EvalData, ProgressEvent } from './types'
+import { AgentEvalResults, EvalDataV2, ProgressEvent } from './types'
 
 export async function runGitEvals2(options: {
   evalDataPath: string
@@ -24,7 +25,9 @@ export async function runGitEvals2(options: {
 }> {
   const { evalDataPath, agents, outputPath, limit, onProgress } = options
 
-  const evalData: EvalData = JSON.parse(fs.readFileSync(evalDataPath, 'utf-8'))
+  const evalData: EvalDataV2 = JSON.parse(
+    fs.readFileSync(evalDataPath, 'utf-8'),
+  )
   const commitsToRun = limit
     ? evalData.evalCommits.slice(0, limit)
     : evalData.evalCommits
@@ -59,8 +62,8 @@ export async function runGitEvals2(options: {
   }
 
   for (const commit of commitsToRun) {
-    console.log(`\n=== Evaluating commit ${commit.sha.slice(0, 7)} ===`)
-    console.log(`Spec: ${commit.spec.slice(0, 100)}...`)
+    console.log(`\n=== Evaluating ${commit.id} ===`)
+    console.log(`Prompt: ${commit.prompt.slice(0, 100)}...`)
 
     // Store trace data for this commit to analyze later
     const commitTraces: AgentTraceData[] = []
@@ -83,8 +86,9 @@ export async function runGitEvals2(options: {
 
         const judgeResult = await judgeCommitResult({
           client,
-          spec: commit.spec,
-          groundTruthFileStates: commit.fileStates,
+          prompt: commit.prompt,
+          groundTruthFileDiffs: commit.fileDiffs,
+          contextFiles: agentResult.contextFiles,
           agentDiff: agentResult.diff,
           error: agentResult.error,
         })
@@ -100,13 +104,10 @@ export async function runGitEvals2(options: {
         }
 
         // Save trace to logs directory
-        const safeSpec = commit.spec
-          .split('\n')[0]
-          .replace(/[^a-zA-Z0-9]/g, '_')
-          .slice(0, 20)
+        const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
         const safeAgentId = agentId.replace(/[^a-zA-Z0-9-]/g, '_')
         const safeCommitShort = commit.sha.slice(0, 7)
-        const traceFilename = `${safeSpec}-${safeAgentId}-${safeCommitShort}.json`
+        const traceFilename = `${safeTaskId}-${safeAgentId}-${safeCommitShort}.json`
         const tracePath = path.join(logsDir, traceFilename)
 
         const traceData = {
@@ -178,7 +179,7 @@ export async function runGitEvals2(options: {
     // After all agents complete for this commit, run trace analysis
     if (commitTraces.length > 1) {
       console.log(
-        `\n=== Analyzing agent traces for commit ${commit.sha.slice(0, 7)} ===`,
+        `\n=== Analyzing agent traces for ${commit.id} (${commit.sha.slice(0, 7)}) ===`,
       )
       try {
         const analysis = await analyzeAgentTraces({
@@ -188,12 +189,9 @@ export async function runGitEvals2(options: {
         })
 
         // Save analysis to logs directory
-        const safeSpec = commit.spec
-          .split('\n')[0]
-          .replace(/[^a-zA-Z0-9]/g, '_')
-          .slice(0, 30)
-        const safeCommitShort = commit.sha.slice(0, 7)
-        const analysisFilename = `${safeSpec}-ANALYSIS-${safeCommitShort}.json`
+        const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
+        const analysisCommitShort = commit.sha.slice(0, 7)
+        const analysisFilename = `${safeTaskId}-ANALYSIS-${analysisCommitShort}.json`
         const analysisPath = path.join(logsDir, analysisFilename)
 
         const analysisData = {