Skip to content

Commit 9455b47

Browse files
committed
bench: update codex models
1 parent 395daba commit 9455b47

12 files changed

+92
-118
lines changed

.github/workflows/nightly-terminal-bench.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ jobs:
2323
id: set-models
2424
run: |
2525
if [ "${{ inputs.models }}" = "all" ] || [ -z "${{ inputs.models }}" ]; then
26-
echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5-codex"]' >> $GITHUB_OUTPUT
26+
echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5.1-codex"]' >> $GITHUB_OUTPUT
2727
else
2828
# Convert comma-separated to JSON array
2929
models="${{ inputs.models }}"

.github/workflows/terminal-bench.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ on:
6161
required: false
6262
type: string
6363
model_name:
64-
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
64+
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5.1-codex)'
6565
required: false
6666
type: string
6767
thinking_level:

src/constants/knownModels.ts

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,13 +66,18 @@ const MODEL_DEFINITIONS = {
6666
GPT_MINI: {
6767
provider: "openai",
6868
providerModelId: "gpt-5.1-codex-mini",
69+
aliases: ["codex-mini"],
6970
},
7071
} as const satisfies Record<string, KnownModelDefinition>;
7172

7273
export type KnownModelKey = keyof typeof MODEL_DEFINITIONS;
74+
const MODEL_DEFINITION_ENTRIES = Object.entries(MODEL_DEFINITIONS) as Array<
75+
[KnownModelKey, KnownModelDefinition]
76+
>;
77+
7378

7479
export const KNOWN_MODELS = Object.fromEntries(
75-
Object.entries(MODEL_DEFINITIONS).map(([key, definition]) => [
80+
MODEL_DEFINITION_ENTRIES.map(([key, definition]) => [
7681
key,
7782
{
7883
...definition,

tests/ipcMain/anthropic1MContext.test.ts

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ import {
44
createEventCollector,
55
assertStreamSuccess,
66
buildLargeHistory,
7+
modelString,
78
} from "./helpers";
89

910
// Skip all tests if TEST_INTEGRATION is not set
@@ -42,8 +43,7 @@ describeIntegration("IpcMain anthropic 1M context integration tests", () => {
4243
env.mockIpcRenderer,
4344
workspaceId,
4445
"Summarize the context above in one word.",
45-
"anthropic",
46-
"claude-sonnet-4-5",
46+
modelString("anthropic", "claude-sonnet-4-5"),
4747
{
4848
providerOptions: {
4949
anthropic: {
@@ -76,8 +76,7 @@ describeIntegration("IpcMain anthropic 1M context integration tests", () => {
7676
env.mockIpcRenderer,
7777
workspaceId,
7878
"Summarize the context above in one word.",
79-
"anthropic",
80-
"claude-sonnet-4-5",
79+
modelString("anthropic", "claude-sonnet-4-5"),
8180
{
8281
providerOptions: {
8382
anthropic: {

tests/ipcMain/forkWorkspace.test.ts

Lines changed: 11 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ import {
1313
createEventCollector,
1414
assertStreamSuccess,
1515
waitFor,
16+
modelString,
1617
} from "./helpers";
1718
import { detectDefaultTrunkBranch } from "../../src/git";
1819
import { HistoryService } from "../../src/services/historyService";
@@ -100,9 +101,8 @@ describeIntegration("IpcMain fork workspace integration tests", () => {
100101
env.mockIpcRenderer,
101102
forkedWorkspaceId,
102103
"What is 2+2? Answer with just the number.",
103-
"anthropic",
104-
"claude-sonnet-4-5"
105-
);
104+
modelString("anthropic", "claude-sonnet-4-5")
105+
);
106106
expect(sendResult.success).toBe(true);
107107

108108
// Verify stream completes successfully
@@ -154,9 +154,8 @@ describeIntegration("IpcMain fork workspace integration tests", () => {
154154
env.mockIpcRenderer,
155155
forkedWorkspaceId,
156156
"What word did I ask you to remember? Reply with just the word.",
157-
"anthropic",
158-
"claude-sonnet-4-5"
159-
);
157+
modelString("anthropic", "claude-sonnet-4-5")
158+
);
160159
expect(sendResult.success).toBe(true);
161160

162161
// Verify stream completes successfully
@@ -206,16 +205,12 @@ describeIntegration("IpcMain fork workspace integration tests", () => {
206205
env.mockIpcRenderer,
207206
sourceWorkspaceId,
208207
"What is 5+5? Answer with just the number.",
209-
"anthropic",
210-
"claude-sonnet-4-5"
211-
),
208+
modelString("anthropic", "claude-sonnet-4-5")),
212209
sendMessageWithModel(
213210
env.mockIpcRenderer,
214211
forkedWorkspaceId,
215212
"What is 3+3? Answer with just the number.",
216-
"anthropic",
217-
"claude-sonnet-4-5"
218-
),
213+
modelString("anthropic", "claude-sonnet-4-5")),
219214
]);
220215

221216
expect(sourceResult.success).toBe(true);
@@ -253,9 +248,8 @@ describeIntegration("IpcMain fork workspace integration tests", () => {
253248
env.mockIpcRenderer,
254249
sourceWorkspaceId,
255250
"Count from 1 to 10, one number per line. Then say 'Done counting.'",
256-
"anthropic",
257-
"claude-sonnet-4-5"
258-
);
251+
modelString("anthropic", "claude-sonnet-4-5")
252+
);
259253

260254
// Wait for stream to start and produce some content
261255
const sourceCollector = createEventCollector(env.sentEvents, sourceWorkspaceId);
@@ -286,9 +280,8 @@ describeIntegration("IpcMain fork workspace integration tests", () => {
286280
env.mockIpcRenderer,
287281
forkedWorkspaceId,
288282
"What is 7+3? Answer with just the number.",
289-
"anthropic",
290-
"claude-sonnet-4-5"
291-
);
283+
modelString("anthropic", "claude-sonnet-4-5")
284+
);
292285
expect(forkedSendResult.success).toBe(true);
293286

294287
// Verify forked workspace stream completes successfully

tests/ipcMain/helpers.ts

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ import * as os from "os";
1414
import { detectDefaultTrunkBranch } from "../../src/git";
1515
import type { TestEnvironment } from "./setup";
1616
import type { RuntimeConfig } from "../../src/types/runtime";
17+
import { KNOWN_MODELS } from "../../src/constants/knownModels";
1718
import type { ToolPolicy } from "../../src/utils/tools/toolPolicy";
1819

1920
// Test constants - centralized for consistency across all tests
@@ -46,6 +47,13 @@ export function modelString(provider: string, model: string): string {
4647
/**
4748
* Send a message via IPC
4849
*/
50+
type SendMessageWithModelOptions = Omit<SendMessageOptions, "model"> & {
51+
imageParts?: Array<{ url: string; mediaType: string }>;
52+
};
53+
54+
const DEFAULT_MODEL_ID = KNOWN_MODELS.SONNET.id;
55+
const DEFAULT_PROVIDER = KNOWN_MODELS.SONNET.provider;
56+
4957
export async function sendMessage(
5058
mockIpcRenderer: IpcRenderer,
5159
workspaceId: string,
@@ -61,19 +69,22 @@ export async function sendMessage(
6169
}
6270

6371
/**
64-
* Send a message with a provider and model (convenience wrapper)
72+
* Send a message with an explicit model id (defaults to SONNET).
6573
*/
6674
export async function sendMessageWithModel(
6775
mockIpcRenderer: IpcRenderer,
6876
workspaceId: string,
6977
message: string,
70-
provider = "anthropic",
71-
model = "claude-sonnet-4-5",
72-
options?: Omit<SendMessageOptions, "model">
78+
modelId: string = DEFAULT_MODEL_ID,
79+
options?: SendMessageWithModelOptions
7380
): Promise<Result<void, SendMessageError>> {
81+
const resolvedModel = modelId.includes(":")
82+
? modelId
83+
: modelString(DEFAULT_PROVIDER, modelId);
84+
7485
return sendMessage(mockIpcRenderer, workspaceId, message, {
7586
...options,
76-
model: modelString(provider, model),
87+
model: resolvedModel,
7788
});
7889
}
7990

tests/ipcMain/modelNotFound.test.ts

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup";
2-
import { sendMessageWithModel, createEventCollector, waitFor } from "./helpers";
2+
import { sendMessageWithModel, createEventCollector, waitFor, modelString } from "./helpers";
33
import { IPC_CHANNELS } from "../../src/constants/ipc-constants";
44
import type { Result } from "../../src/types/result";
55
import type { SendMessageError } from "../../src/types/errors";
@@ -30,9 +30,8 @@ describeIntegration("IpcMain model_not_found error handling", () => {
3030
env.mockIpcRenderer,
3131
workspaceId,
3232
"Hello",
33-
"anthropic",
34-
"invalid-model-that-does-not-exist-xyz123"
35-
);
33+
modelString("anthropic", "invalid-model-that-does-not-exist-xyz123")
34+
);
3635

3736
// Collect events to verify error classification
3837
const collector = createEventCollector(env.sentEvents, workspaceId);
@@ -69,9 +68,8 @@ describeIntegration("IpcMain model_not_found error handling", () => {
6968
env.mockIpcRenderer,
7069
workspaceId,
7170
"Hello",
72-
"openai",
73-
"gpt-nonexistent-model-xyz123"
74-
);
71+
modelString("openai", "gpt-nonexistent-model-xyz123")
72+
);
7573

7674
// Collect events to verify error classification
7775
const collector = createEventCollector(env.sentEvents, workspaceId);

tests/ipcMain/ollama.test.ts

Lines changed: 8 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ import {
44
createEventCollector,
55
assertStreamSuccess,
66
extractTextFromEvents,
7+
modelString,
78
} from "./helpers";
89
import { spawn } from "child_process";
910

@@ -105,9 +106,8 @@ describeOllama("IpcMain Ollama integration tests", () => {
105106
env.mockIpcRenderer,
106107
workspaceId,
107108
"Say 'hello' and nothing else",
108-
"ollama",
109-
OLLAMA_MODEL
110-
);
109+
modelString("ollama", OLLAMA_MODEL)
110+
);
111111

112112
// Verify the IPC call succeeded
113113
expect(result.success).toBe(true);
@@ -139,9 +139,8 @@ describeOllama("IpcMain Ollama integration tests", () => {
139139
env.mockIpcRenderer,
140140
workspaceId,
141141
"What is the current date and time? Use the bash tool to find out.",
142-
"ollama",
143-
OLLAMA_MODEL
144-
);
142+
modelString("ollama", OLLAMA_MODEL)
143+
);
145144

146145
expect(result.success).toBe(true);
147146

@@ -178,9 +177,8 @@ describeOllama("IpcMain Ollama integration tests", () => {
178177
env.mockIpcRenderer,
179178
workspaceId,
180179
"Read the README.md file and tell me what the first heading says.",
181-
"ollama",
182-
OLLAMA_MODEL
183-
);
180+
modelString("ollama", OLLAMA_MODEL)
181+
);
184182

185183
expect(result.success).toBe(true);
186184

@@ -216,8 +214,7 @@ describeOllama("IpcMain Ollama integration tests", () => {
216214
env.mockIpcRenderer,
217215
workspaceId,
218216
"This should fail",
219-
"ollama",
220-
OLLAMA_MODEL,
217+
modelString("ollama", OLLAMA_MODEL),
221218
{
222219
providerOptions: {
223220
ollama: {},

tests/ipcMain/openai-web-search.test.ts

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup";
2-
import { sendMessageWithModel, createEventCollector, assertStreamSuccess } from "./helpers";
2+
import { sendMessageWithModel, createEventCollector, assertStreamSuccess, modelString } from "./helpers";
33

44
// Skip all tests if TEST_INTEGRATION is not set
55
const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
@@ -32,8 +32,7 @@ describeIntegration("OpenAI web_search integration tests", () => {
3232
workspaceId,
3333
"Use web search to find the current weather in San Francisco. " +
3434
"Then tell me if it's a good day for a picnic.",
35-
"openai",
36-
"gpt-5-codex",
35+
modelString("openai", "gpt-5.1-codex-mini"),
3736
{
3837
thinkingLevel: "medium", // Ensure reasoning without excessive deliberation
3938
}

tests/ipcMain/resumeStream.test.ts

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup";
2-
import { sendMessageWithModel, createEventCollector, waitFor } from "./helpers";
2+
import { sendMessageWithModel, createEventCollector, waitFor, modelString } from "./helpers";
33
import { IPC_CHANNELS } from "../../src/constants/ipc-constants";
44
import type { Result } from "../../src/types/result";
55
import type { SendMessageError } from "../../src/types/errors";
@@ -31,9 +31,8 @@ describeIntegration("IpcMain resumeStream integration tests", () => {
3131
env.mockIpcRenderer,
3232
workspaceId,
3333
`Run this bash command: for i in 1 2 3; do sleep 0.5; done && echo '${expectedWord}'`,
34-
"anthropic",
35-
"claude-sonnet-4-5"
36-
);
34+
modelString("anthropic", "claude-sonnet-4-5")
35+
);
3736

3837
// Wait for stream to start
3938
const collector1 = createEventCollector(env.sentEvents, workspaceId);

0 commit comments

Comments
 (0)