Add a parameter to run git evals using the spec itself as the sole prompt, instead of having a prompting agent generate prompts

jahooma · jahooma · commit e39e92c824f4 · 2025-10-08T11:07:43.000-07:00
diff --git a/evals/git-evals/run-git-evals.ts b/evals/git-evals/run-git-evals.ts
@@ -42,6 +42,7 @@ export async function runSingleEval(
   fingerprintId: string,
   codingAgent: 'codebuff' | 'claude',
   agent?: string,
+  promptWithSpec: boolean = false,
 ): Promise<EvalRunJudged> {
   const startTime = new Date()
   const trace: CodebuffTrace[] = []
@@ -93,7 +94,7 @@ export async function runSingleEval(
 
     let currentDecision: AgentDecision = 'continue'
     let attempts = 0
-    const MAX_ATTEMPTS = 5
+    const MAX_ATTEMPTS = promptWithSpec ? 1 : 5
 
     while (currentDecision === 'continue' && attempts < MAX_ATTEMPTS) {
       // Check for process-level errors
@@ -119,11 +120,17 @@ export async function runSingleEval(
       // Get next prompt from prompting agent with timeout
       let agentResponse: z.infer<typeof AgentDecisionSchema>
       try {
-        agentResponse = await promptAiSdkStructured({
-          messages: [
-            {
-              role: 'user',
-              content: `You are an expert software engineer tasked with implementing a specification using CodeBuff, an AI coding assistant. Your goal is to prompt CodeBuff to implement the spec correctly. You are in a conversation with this coding agent.
+        agentResponse = promptWithSpec
+          ? {
+              decision: 'continue',
+              reasoning: 'Using spec as sole prompt',
+              next_prompt: evalCommit.spec,
+            }
+          : await promptAiSdkStructured({
+              messages: [
+                {
+                  role: 'user',
+                  content: `You are an expert software engineer tasked with implementing a specification using CodeBuff, an AI coding assistant. Your goal is to prompt CodeBuff to implement the spec correctly. You are in a conversation with this coding agent.
 
 Current spec to implement:
 <spec>${evalCommit.spec}</spec>
@@ -142,16 +149,16 @@ You must decide whether to:
 
 If deciding to continue, include a clear, focused prompt for Codebuff in next_prompt. Note that Codebuff does not have access to the spec, so you must describe the changes you want Codebuff to make in a way that is clear and concise.
 Explain your reasoning in detail. Do not ask Codebuff to git commit changes.`,
-            },
-          ],
-          schema: AgentDecisionSchema,
-          model: 'x-ai/grok-4-fast',
-          clientSessionId,
-          fingerprintId,
-          userInputId: generateCompactId(),
-          userId: undefined,
-          timeout: 5 * 60_000, // 5 minute timeout
-        })
+                },
+              ],
+              schema: AgentDecisionSchema,
+              model: 'x-ai/grok-4-fast',
+              clientSessionId,
+              fingerprintId,
+              userInputId: generateCompactId(),
+              userId: undefined,
+              timeout: 5 * 60_000, // 5 minute timeout
+            })
       } catch (agentError) {
         throw new Error(
           `Agent decision failed: ${agentError instanceof Error ? `${agentError.message}\n${JSON.stringify(agentError)}\n${agentError.stack}` : String(agentError)}`,
diff --git a/evals/git-evals/run-single-eval-process.ts b/evals/git-evals/run-single-eval-process.ts
@@ -74,6 +74,7 @@ async function main() {
       fingerprintId,
       codingAgent as any,
       agent,
+      false,
     )
 
     // Check again after long-running operation
diff --git a/evals/git-evals/run-single-eval.ts b/evals/git-evals/run-single-eval.ts
@@ -199,6 +199,7 @@ async function runSingleEvalTask(options: {
       fingerprintId,
       codingAgent,
       agentType,
+      false,
     )
 
     const duration = Date.now() - startTime

Original file line number	Diff line number	Diff line change
`@@ -74,6 +74,7 @@ async function main() {`
`74`	`74`	`fingerprintId,`
`75`	`75`	`codingAgent as any,`
`76`	`76`	`agent,`
	`77`	`+ false,`
`77`	`78`	`)`
`78`	`79`
`79`	`80`	`// Check again after long-running operation`
Original file line number	Diff line number	Diff line change
`@@ -199,6 +199,7 @@ async function runSingleEvalTask(options: {`
`199`	`199`	`fingerprintId,`
`200`	`200`	`codingAgent,`
`201`	`201`	`agentType,`
	`202`	`+ false,`
`202`	`203`	`)`
`203`	`204`
`204`	`205`	`const duration = Date.now() - startTime`