Skip to content

Commit e39e92c

Browse files
committed
Param to run git evals on just spec instead of prompting agent
1 parent 86efd8f commit e39e92c

File tree

3 files changed: 25 additions and 16 deletions.

evals/git-evals/run-git-evals.ts

Lines changed: 23 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ export async function runSingleEval(
4242
fingerprintId: string,
4343
codingAgent: 'codebuff' | 'claude',
4444
agent?: string,
45+
promptWithSpec: boolean = false,
4546
): Promise<EvalRunJudged> {
4647
const startTime = new Date()
4748
const trace: CodebuffTrace[] = []
@@ -93,7 +94,7 @@ export async function runSingleEval(
9394

9495
let currentDecision: AgentDecision = 'continue'
9596
let attempts = 0
96-
const MAX_ATTEMPTS = 5
97+
const MAX_ATTEMPTS = promptWithSpec ? 1 : 5
9798

9899
while (currentDecision === 'continue' && attempts < MAX_ATTEMPTS) {
99100
// Check for process-level errors
@@ -119,11 +120,17 @@ export async function runSingleEval(
119120
// Get next prompt from prompting agent with timeout
120121
let agentResponse: z.infer<typeof AgentDecisionSchema>
121122
try {
122-
agentResponse = await promptAiSdkStructured({
123-
messages: [
124-
{
125-
role: 'user',
126-
content: `You are an expert software engineer tasked with implementing a specification using CodeBuff, an AI coding assistant. Your goal is to prompt CodeBuff to implement the spec correctly. You are in a conversation with this coding agent.
123+
agentResponse = promptWithSpec
124+
? {
125+
decision: 'continue',
126+
reasoning: 'Using spec as sole prompt',
127+
next_prompt: evalCommit.spec,
128+
}
129+
: await promptAiSdkStructured({
130+
messages: [
131+
{
132+
role: 'user',
133+
content: `You are an expert software engineer tasked with implementing a specification using CodeBuff, an AI coding assistant. Your goal is to prompt CodeBuff to implement the spec correctly. You are in a conversation with this coding agent.
127134
128135
Current spec to implement:
129136
<spec>${evalCommit.spec}</spec>
@@ -142,16 +149,16 @@ You must decide whether to:
142149
143150
If deciding to continue, include a clear, focused prompt for Codebuff in next_prompt. Note that Codebuff does not have access to the spec, so you must describe the changes you want Codebuff to make in a way that is clear and concise.
144151
Explain your reasoning in detail. Do not ask Codebuff to git commit changes.`,
145-
},
146-
],
147-
schema: AgentDecisionSchema,
148-
model: 'x-ai/grok-4-fast',
149-
clientSessionId,
150-
fingerprintId,
151-
userInputId: generateCompactId(),
152-
userId: undefined,
153-
timeout: 5 * 60_000, // 5 minute timeout
154-
})
152+
},
153+
],
154+
schema: AgentDecisionSchema,
155+
model: 'x-ai/grok-4-fast',
156+
clientSessionId,
157+
fingerprintId,
158+
userInputId: generateCompactId(),
159+
userId: undefined,
160+
timeout: 5 * 60_000, // 5 minute timeout
161+
})
155162
} catch (agentError) {
156163
throw new Error(
157164
`Agent decision failed: ${agentError instanceof Error ? `${agentError.message}\n${JSON.stringify(agentError)}\n${agentError.stack}` : String(agentError)}`,

evals/git-evals/run-single-eval-process.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,7 @@ async function main() {
7474
fingerprintId,
7575
codingAgent as any,
7676
agent,
77+
false,
7778
)
7879

7980
// Check again after long-running operation

evals/git-evals/run-single-eval.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -199,6 +199,7 @@ async function runSingleEvalTask(options: {
199199
fingerprintId,
200200
codingAgent,
201201
agentType,
202+
false,
202203
)
203204

204205
const duration = Date.now() - startTime

0 commit comments

Comments
 (0)