CodebuffAI
diff --git a/‎evals/git-evals/eval-codebuff2.json‎
Lines changed: 1 addition & 6 deletions b/‎evals/git-evals/eval-codebuff2.json‎
Lines changed: 1 addition & 6 deletions
diff --git a/‎evals/git-evals/logs/codebuff-yw_Q5Gr1Tls/eval-commit-212590d.json‎
Lines changed: 0 additions & 1 deletion b/‎evals/git-evals/logs/codebuff-yw_Q5Gr1Tls/eval-commit-212590d.json‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎evals/git-evals2/README.md‎
Lines changed: 126 additions & 0 deletions b/‎evals/git-evals2/README.md‎
Lines changed: 126 additions & 0 deletions
diff --git a/‎evals/git-evals2/agent-runner.ts‎
Lines changed: 77 additions & 0 deletions b/‎evals/git-evals2/agent-runner.ts‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎evals/git-evals2/example.ts‎
Lines changed: 48 additions & 0 deletions b/‎evals/git-evals2/example.ts‎
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,126 @@
+# git-evals2
+
+A simplified evaluation system for comparing Codebuff agents on git commit tasks.
+
+## Overview
+
+git-evals2 is a streamlined rewrite of the original git-evals system, inspired by the subagents evals (eval-planner and test-repo-utils). It focuses on simplicity and ease of use while maintaining the core functionality of agent evaluation.
+
+## Key Simplifications
+
+Compared to the original git-evals:
+
+- **No child processes**: Runs everything in-process with async/await
+- **No prompting agent**: Single-shot execution - agent gets the spec once and runs until done
+- **Codebuff agents only**: Uses the SDK client exclusively (no Claude runner)
+- **No trace in judging**: Judge only sees final file changes vs ground truth (not agent execution steps)
+- **Function-based API**: Simple exported function instead of CLI with complex process management
+- **Minimal metadata**: Only tracks essential metrics (diff, duration, cost, optional error)
+
+## Usage
+
+```typescript
+import { runGitEvals2 } from './evals/git-evals2/run-git-evals2'
+
+const results = await runGitEvals2({
+  evalDataPath: 'evals/git-evals/eval-codebuff2.json',
+  agents: ['base', 'base-lite'],
+  outputPath: 'evals/git-evals2/results.json',
+  limit: 5,
+  onProgress: (event) => {
+    if (event.type === 'agent_complete') {
+      console.log(`${event.agent} completed with score ${event.score}`)
+    }
+  },
+})
+
+console.log('Average scores:', {
+  base: results.agents.get('base')?.averageScore,
+  'base-lite': results.agents.get('base-lite')?.averageScore,
+})
+```
+
+## API
+
+### `runGitEvals2(options: GitEvals2Options): Promise<GitEvals2Result>`
+
+#### Options
+
+- `evalDataPath` (string): Path to eval JSON file with commits
+- `agents` (string[]): Array of agent IDs to compare (e.g., ['base', 'base-lite'])
+- `outputPath?` (string): Optional path to write results JSON
+- `limit?` (number): Optional max number of commits to evaluate
+- `onProgress?` (callback): Optional progress event handler
+- `client?` (CodebuffClient): Optional SDK client override (useful for testing)
+
+#### Result
+
+```typescript
+interface GitEvals2Result {
+  agents: Map<string, AgentEvalResults>
+  timestamp: string
+  totalDuration: number
+}
+
+interface AgentEvalResults {
+  agentId: string
+  runs: EvalRun[]
+  averageScore: number
+  averageCost: number
+  averageDuration: number
+}
+
+interface EvalRun {
+  commitSha: string
+  spec: string
+  diff: string
+  judgeScore: number
+  judgeFeedback: string
+  cost: number
+  durationMs: number
+  error?: string
+}
+```
+
+## How It Differs
+
+### Architecture
+
+- **Original**: Forks a child process for each eval and coordinates them through complex IPC
+- **git-evals2**: Simple async functions with Promise.all for parallelism
+
+### Execution
+
+- **Original**: Multi-turn conversations with prompting agent deciding continue/complete/halt
+- **git-evals2**: Single-shot - agent gets spec and runs until done or timeout
+
+### Judging
+
+- **Original**: Judge sees spec + agent trace + final diff, 3 judges with median selection
+- **git-evals2**: Judge only sees spec + final diff (no trace), single judge call
+
+### State Management
+
+- **Original**: Complex SessionState threading, manual state updates
+- **git-evals2**: SDK handles state internally, minimal metadata tracking
+
+### Error Handling
+
+- **Original**: Process-level handlers, signal management, cleanup logic
+- **git-evals2**: Standard try-catch, continues on errors, records them in results
+
+## Module Structure
+
+- `run-git-evals2.ts`: Main orchestration function
+- `agent-runner.ts`: Executes single agent on a commit
+- `judge.ts`: Judges file changes without trace
+- `types.ts`: Type definitions
+- `example.ts`: Example usage
+
+## Benefits
+
+- **Simpler codebase**: ~90% less code than the original system
+- **Faster execution**: Less overhead from process management
+- **Easier debugging**: Everything in-process with standard async/await
+- **More maintainable**: Clear separation of concerns, modular design
+- **Still powerful**: Maintains core evaluation functionality
@@ -0,0 +1,77 @@
+import { execSync } from 'child_process'
+import path from 'path'
+
+import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
+import { CodebuffClient } from '../../sdk/src/client'
+import { withTestRepo } from '../subagents/test-repo-utils'
+
+import type { EvalCommit } from './types'
+
+export interface AgentRunResult { // Outcome of one agent run on one commit, as returned by runAgentOnCommit.
+  diff: string // Output of `git diff HEAD` taken after staging all changes; stays '' if the run threw before the diff was captured.
+  durationMs: number // Milliseconds elapsed between the start of runAgentOnCommit and the moment its result is built.
+  cost: number // Main agent state's `creditsUsed` divided by 100; stays 0 if the run threw before it was read.
+  error?: string // Message plus stack of a caught Error (or String(e) for other thrown values); undefined when nothing threw.
+}
+
+/**
+ * Runs a single agent against one eval commit and captures what it changed.
+ *
+ * The repository is prepared through `withTestRepo` (called with
+ * `checkoutPrevious: true`), the agent receives `commit.spec` as its prompt,
+ * and afterwards every change in the working tree is staged and diffed
+ * against `HEAD`.
+ *
+ * This function does not reject: any failure inside the run is caught and
+ * reported through the `error` field of the result. In that case `diff` and
+ * `cost` keep whatever value they had when the failure happened ('' and 0 if
+ * nothing was captured yet).
+ *
+ * @param client - SDK client used to execute the agent.
+ * @param agentId - Identifier of the agent to run.
+ * @param commit - Eval commit; `sha` selects the checkout, `spec` is the prompt.
+ * @param repoUrl - Repository URL forwarded to `withTestRepo`.
+ * @param initCommand - Optional command forwarded to `withTestRepo`.
+ * @returns The captured diff, elapsed time, cost and optional error text.
+ */
+export async function runAgentOnCommit({
+  client,
+  agentId,
+  commit,
+  repoUrl,
+  initCommand,
+}: {
+  client: CodebuffClient
+  agentId: string
+  commit: EvalCommit
+  repoUrl: string
+  initCommand?: string
+}): Promise<AgentRunResult> {
+  // execSync's default maxBuffer is 1 MiB. A larger diff would make execSync
+  // throw, and the catch below would then record a finished run as a failure
+  // with an empty diff. Allow substantially larger diffs instead.
+  const DIFF_MAX_BUFFER_BYTES = 64 * 1024 * 1024
+
+  const startTime = Date.now()
+  let diff = ''
+  let error: string | undefined
+  let cost = 0
+
+  try {
+    await withTestRepo(
+      {
+        repoUrl,
+        commitSha: commit.sha,
+        initCommand,
+        checkoutPrevious: true,
+      },
+      async (repoDir) => {
+        // Agent definitions are loaded from the repository-level .agents directory.
+        const agentsPath = path.join(__dirname, '../../.agents')
+        const localAgentDefinitions = Object.values(
+          await loadLocalAgents({ agentsPath }),
+        )
+
+        const result = await client.run({
+          agent: agentId,
+          prompt: commit.spec,
+          agentDefinitions: localAgentDefinitions,
+          cwd: repoDir,
+        })
+
+        cost = result.sessionState.mainAgentState.creditsUsed / 100
+
+        // Stage everything first so newly created (untracked) files are part
+        // of the diff against HEAD.
+        execSync('git add .', { cwd: repoDir, stdio: 'ignore' })
+        diff = execSync('git diff HEAD', {
+          cwd: repoDir,
+          encoding: 'utf-8',
+          maxBuffer: DIFF_MAX_BUFFER_BYTES,
+        })
+      },
+    )
+  } catch (e) {
+    // Failures are recorded in the result rather than propagated to the caller.
+    error = e instanceof Error ? `${e.message}\n${e.stack}` : String(e)
+  }
+
+  const durationMs = Date.now() - startTime
+
+  return {
+    diff,
+    durationMs,
+    cost,
+    error,
+  }
+}
@@ -0,0 +1,48 @@
+import path from 'path'
+import { runGitEvals2 } from './run-git-evals2'
+
+/**
+ * Example driver: runs `runGitEvals2` for the 'base' and 'base-lite' agents
+ * with a limit of 3 commits, logs progress events as they arrive, and prints
+ * a per-agent summary once the run has finished.
+ */
+async function main() {
+  console.log('Running git-evals2 example...')
+  console.log('Comparing base and base-lite agents on first 3 commits\n')
+
+  const evalDataPath = path.join(__dirname, '../git-evals/eval-codebuff2.json')
+  const outputPath = path.join(__dirname, '../git-evals2/example-results.json')
+
+  const results = await runGitEvals2({
+    evalDataPath,
+    agents: ['base', 'base-lite'],
+    outputPath,
+    limit: 3,
+    onProgress: (event) => {
+      // Only these three event types are reported; anything else is ignored.
+      switch (event.type) {
+        case 'agent_start':
+          console.log(
+            `[${event.agent}] Starting on commit ${event.commit.slice(0, 7)}...`,
+          )
+          break
+        case 'agent_complete':
+          console.log(
+            `[${event.agent}] ✓ Completed with score ${event.score.toFixed(1)}/10`,
+          )
+          break
+        case 'agent_error':
+          console.log(`[${event.agent}] ✗ Error: ${event.error}`)
+          break
+      }
+    },
+  })
+
+  // Formats a millisecond value as seconds with one decimal place.
+  const toSeconds = (ms: number) => (ms / 1000).toFixed(1)
+
+  console.log('\n=== Final Results ===')
+  console.log(`Total duration: ${toSeconds(results.totalDuration)}s\n`)
+
+  for (const [agentId, summary] of results.agents) {
+    console.log(`${agentId}:`)
+    console.log(`  Score: ${summary.averageScore.toFixed(2)}/10`)
+    console.log(`  Cost: $${summary.averageCost.toFixed(4)}`)
+    console.log(`  Duration: ${toSeconds(summary.averageDuration)}s`)
+    // A run counts as successful when it carries no error text.
+    const succeeded = summary.runs.filter((run) => !run.error).length
+    console.log(`  Success: ${succeeded}/${summary.runs.length}`)
+    console.log()
+  }
+}
+
+// Run the example only when this file is executed directly, not when imported.
+if (import.meta.main) {
+  // Log the failure and terminate with a non-zero exit code.
+  const reportAndExit = (error: unknown) => {
+    console.error('Error running example:', error)
+    process.exit(1)
+  }
+
+  main().catch(reportAndExit)
+}