Skip to content

Commit b8fd81c

Browse files
committed
bench: update codex models
1 parent 395daba commit b8fd81c

12 files changed

+110
-106
lines changed

.github/workflows/nightly-terminal-bench.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323
id: set-models
2424
run: |
2525
if [ "${{ inputs.models }}" = "all" ] || [ -z "${{ inputs.models }}" ]; then
26-
echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5-codex"]' >> $GITHUB_OUTPUT
26+
echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5.1-codex"]' >> $GITHUB_OUTPUT
2727
else
2828
# Convert comma-separated to JSON array
2929
models="${{ inputs.models }}"

.github/workflows/terminal-bench.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ on:
6161
required: false
6262
type: string
6363
model_name:
64-
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
64+
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5.1-codex)'
6565
required: false
6666
type: string
6767
thinking_level:

src/constants/knownModels.ts

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,20 +66,30 @@ const MODEL_DEFINITIONS = {
6666
GPT_MINI: {
6767
provider: "openai",
6868
providerModelId: "gpt-5.1-codex-mini",
69+
aliases: ["codex-mini"],
6970
},
7071
} as const satisfies Record<string, KnownModelDefinition>;
7172

7273
export type KnownModelKey = keyof typeof MODEL_DEFINITIONS;
74+
const MODEL_DEFINITION_ENTRIES = Object.entries(MODEL_DEFINITIONS) as Array<
75+
[KnownModelKey, KnownModelDefinition]
76+
>;
7377

7478
export const KNOWN_MODELS = Object.fromEntries(
75-
Object.entries(MODEL_DEFINITIONS).map(([key, definition]) => [
79+
MODEL_DEFINITION_ENTRIES.map(([key, definition]) => toKnownModelEntry(key, definition))
80+
);
81+
function toKnownModelEntry<K extends KnownModelKey>(
82+
key: K,
83+
definition: KnownModelDefinition
84+
): [K, KnownModel] {
85+
return [
7686
key,
7787
{
7888
...definition,
79-
id: `${definition.provider}:${definition.providerModelId}` as `${ModelProvider}:${string}`,
89+
id: `${definition.provider}:${definition.providerModelId}`,
8090
},
81-
])
82-
) as Record<KnownModelKey, KnownModel>;
91+
];
92+
}
8393

8494
export function getKnownModel(key: KnownModelKey): KnownModel {
8595
return KNOWN_MODELS[key];

tests/ipcMain/anthropic1MContext.test.ts

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import {
44
createEventCollector,
55
assertStreamSuccess,
66
buildLargeHistory,
7+
modelString,
78
} from "./helpers";
89

910
// Skip all tests if TEST_INTEGRATION is not set
@@ -42,8 +43,7 @@ describeIntegration("IpcMain anthropic 1M context integration tests", () => {
4243
env.mockIpcRenderer,
4344
workspaceId,
4445
"Summarize the context above in one word.",
45-
"anthropic",
46-
"claude-sonnet-4-5",
46+
modelString("anthropic", "claude-sonnet-4-5"),
4747
{
4848
providerOptions: {
4949
anthropic: {
@@ -76,8 +76,7 @@ describeIntegration("IpcMain anthropic 1M context integration tests", () => {
7676
env.mockIpcRenderer,
7777
workspaceId,
7878
"Summarize the context above in one word.",
79-
"anthropic",
80-
"claude-sonnet-4-5",
79+
modelString("anthropic", "claude-sonnet-4-5"),
8180
{
8281
providerOptions: {
8382
anthropic: {

tests/ipcMain/forkWorkspace.test.ts

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import {
1313
createEventCollector,
1414
assertStreamSuccess,
1515
waitFor,
16+
modelString,
1617
} from "./helpers";
1718
import { detectDefaultTrunkBranch } from "../../src/git";
1819
import { HistoryService } from "../../src/services/historyService";
@@ -100,8 +101,7 @@ describeIntegration("IpcMain fork workspace integration tests", () => {
100101
env.mockIpcRenderer,
101102
forkedWorkspaceId,
102103
"What is 2+2? Answer with just the number.",
103-
"anthropic",
104-
"claude-sonnet-4-5"
104+
modelString("anthropic", "claude-sonnet-4-5")
105105
);
106106
expect(sendResult.success).toBe(true);
107107

@@ -154,8 +154,7 @@ describeIntegration("IpcMain fork workspace integration tests", () => {
154154
env.mockIpcRenderer,
155155
forkedWorkspaceId,
156156
"What word did I ask you to remember? Reply with just the word.",
157-
"anthropic",
158-
"claude-sonnet-4-5"
157+
modelString("anthropic", "claude-sonnet-4-5")
159158
);
160159
expect(sendResult.success).toBe(true);
161160

@@ -206,15 +205,13 @@ describeIntegration("IpcMain fork workspace integration tests", () => {
206205
env.mockIpcRenderer,
207206
sourceWorkspaceId,
208207
"What is 5+5? Answer with just the number.",
209-
"anthropic",
210-
"claude-sonnet-4-5"
208+
modelString("anthropic", "claude-sonnet-4-5")
211209
),
212210
sendMessageWithModel(
213211
env.mockIpcRenderer,
214212
forkedWorkspaceId,
215213
"What is 3+3? Answer with just the number.",
216-
"anthropic",
217-
"claude-sonnet-4-5"
214+
modelString("anthropic", "claude-sonnet-4-5")
218215
),
219216
]);
220217

@@ -253,8 +250,7 @@ describeIntegration("IpcMain fork workspace integration tests", () => {
253250
env.mockIpcRenderer,
254251
sourceWorkspaceId,
255252
"Count from 1 to 10, one number per line. Then say 'Done counting.'",
256-
"anthropic",
257-
"claude-sonnet-4-5"
253+
modelString("anthropic", "claude-sonnet-4-5")
258254
);
259255

260256
// Wait for stream to start and produce some content
@@ -286,8 +282,7 @@ describeIntegration("IpcMain fork workspace integration tests", () => {
286282
env.mockIpcRenderer,
287283
forkedWorkspaceId,
288284
"What is 7+3? Answer with just the number.",
289-
"anthropic",
290-
"claude-sonnet-4-5"
285+
modelString("anthropic", "claude-sonnet-4-5")
291286
);
292287
expect(forkedSendResult.success).toBe(true);
293288

tests/ipcMain/helpers.ts

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import * as os from "os";
1414
import { detectDefaultTrunkBranch } from "../../src/git";
1515
import type { TestEnvironment } from "./setup";
1616
import type { RuntimeConfig } from "../../src/types/runtime";
17+
import { KNOWN_MODELS } from "../../src/constants/knownModels";
1718
import type { ToolPolicy } from "../../src/utils/tools/toolPolicy";
1819

1920
// Test constants - centralized for consistency across all tests
@@ -46,6 +47,13 @@ export function modelString(provider: string, model: string): string {
4647
/**
4748
* Send a message via IPC
4849
*/
50+
type SendMessageWithModelOptions = Omit<SendMessageOptions, "model"> & {
51+
imageParts?: Array<{ url: string; mediaType: string }>;
52+
};
53+
54+
const DEFAULT_MODEL_ID = KNOWN_MODELS.SONNET.id;
55+
const DEFAULT_PROVIDER = KNOWN_MODELS.SONNET.provider;
56+
4957
export async function sendMessage(
5058
mockIpcRenderer: IpcRenderer,
5159
workspaceId: string,
@@ -61,19 +69,20 @@ export async function sendMessage(
6169
}
6270

6371
/**
64-
* Send a message with a provider and model (convenience wrapper)
72+
* Send a message with an explicit model id (defaults to SONNET).
6573
*/
6674
export async function sendMessageWithModel(
6775
mockIpcRenderer: IpcRenderer,
6876
workspaceId: string,
6977
message: string,
70-
provider = "anthropic",
71-
model = "claude-sonnet-4-5",
72-
options?: Omit<SendMessageOptions, "model">
78+
modelId: string = DEFAULT_MODEL_ID,
79+
options?: SendMessageWithModelOptions
7380
): Promise<Result<void, SendMessageError>> {
81+
const resolvedModel = modelId.includes(":") ? modelId : modelString(DEFAULT_PROVIDER, modelId);
82+
7483
return sendMessage(mockIpcRenderer, workspaceId, message, {
7584
...options,
76-
model: modelString(provider, model),
85+
model: resolvedModel,
7786
});
7887
}
7988

tests/ipcMain/modelNotFound.test.ts

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup";
2-
import { sendMessageWithModel, createEventCollector, waitFor } from "./helpers";
2+
import { sendMessageWithModel, createEventCollector, waitFor, modelString } from "./helpers";
33
import { IPC_CHANNELS } from "../../src/constants/ipc-constants";
44
import type { Result } from "../../src/types/result";
55
import type { SendMessageError } from "../../src/types/errors";
@@ -30,8 +30,7 @@ describeIntegration("IpcMain model_not_found error handling", () => {
3030
env.mockIpcRenderer,
3131
workspaceId,
3232
"Hello",
33-
"anthropic",
34-
"invalid-model-that-does-not-exist-xyz123"
33+
modelString("anthropic", "invalid-model-that-does-not-exist-xyz123")
3534
);
3635

3736
// Collect events to verify error classification
@@ -69,8 +68,7 @@ describeIntegration("IpcMain model_not_found error handling", () => {
6968
env.mockIpcRenderer,
7069
workspaceId,
7170
"Hello",
72-
"openai",
73-
"gpt-nonexistent-model-xyz123"
71+
modelString("openai", "gpt-nonexistent-model-xyz123")
7472
);
7573

7674
// Collect events to verify error classification

tests/ipcMain/ollama.test.ts

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import {
44
createEventCollector,
55
assertStreamSuccess,
66
extractTextFromEvents,
7+
modelString,
78
} from "./helpers";
89
import { spawn } from "child_process";
910

@@ -105,8 +106,7 @@ describeOllama("IpcMain Ollama integration tests", () => {
105106
env.mockIpcRenderer,
106107
workspaceId,
107108
"Say 'hello' and nothing else",
108-
"ollama",
109-
OLLAMA_MODEL
109+
modelString("ollama", OLLAMA_MODEL)
110110
);
111111

112112
// Verify the IPC call succeeded
@@ -139,8 +139,7 @@ describeOllama("IpcMain Ollama integration tests", () => {
139139
env.mockIpcRenderer,
140140
workspaceId,
141141
"What is the current date and time? Use the bash tool to find out.",
142-
"ollama",
143-
OLLAMA_MODEL
142+
modelString("ollama", OLLAMA_MODEL)
144143
);
145144

146145
expect(result.success).toBe(true);
@@ -178,8 +177,7 @@ describeOllama("IpcMain Ollama integration tests", () => {
178177
env.mockIpcRenderer,
179178
workspaceId,
180179
"Read the README.md file and tell me what the first heading says.",
181-
"ollama",
182-
OLLAMA_MODEL
180+
modelString("ollama", OLLAMA_MODEL)
183181
);
184182

185183
expect(result.success).toBe(true);
@@ -216,8 +214,7 @@ describeOllama("IpcMain Ollama integration tests", () => {
216214
env.mockIpcRenderer,
217215
workspaceId,
218216
"This should fail",
219-
"ollama",
220-
OLLAMA_MODEL,
217+
modelString("ollama", OLLAMA_MODEL),
221218
{
222219
providerOptions: {
223220
ollama: {},

tests/ipcMain/openai-web-search.test.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup";
2-
import { sendMessageWithModel, createEventCollector, assertStreamSuccess } from "./helpers";
2+
import {
3+
sendMessageWithModel,
4+
createEventCollector,
5+
assertStreamSuccess,
6+
modelString,
7+
} from "./helpers";
38

49
// Skip all tests if TEST_INTEGRATION is not set
510
const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
@@ -32,8 +37,7 @@ describeIntegration("OpenAI web_search integration tests", () => {
3237
workspaceId,
3338
"Use web search to find the current weather in San Francisco. " +
3439
"Then tell me if it's a good day for a picnic.",
35-
"openai",
36-
"gpt-5-codex",
40+
modelString("openai", "gpt-5.1-codex-mini"),
3741
{
3842
thinkingLevel: "medium", // Ensure reasoning without excessive deliberation
3943
}

tests/ipcMain/resumeStream.test.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup";
2-
import { sendMessageWithModel, createEventCollector, waitFor } from "./helpers";
2+
import { sendMessageWithModel, createEventCollector, waitFor, modelString } from "./helpers";
33
import { IPC_CHANNELS } from "../../src/constants/ipc-constants";
44
import type { Result } from "../../src/types/result";
55
import type { SendMessageError } from "../../src/types/errors";
@@ -31,8 +31,7 @@ describeIntegration("IpcMain resumeStream integration tests", () => {
3131
env.mockIpcRenderer,
3232
workspaceId,
3333
`Run this bash command: for i in 1 2 3; do sleep 0.5; done && echo '${expectedWord}'`,
34-
"anthropic",
35-
"claude-sonnet-4-5"
34+
modelString("anthropic", "claude-sonnet-4-5")
3635
);
3736

3837
// Wait for stream to start

0 commit comments

Comments
 (0)