diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml index 70226826bb..2d17305143 100644 --- a/.github/workflows/nightly-terminal-bench.yml +++ b/.github/workflows/nightly-terminal-bench.yml @@ -23,7 +23,7 @@ jobs: id: set-models run: | if [ "${{ inputs.models }}" = "all" ] || [ -z "${{ inputs.models }}" ]; then - echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5-codex"]' >> $GITHUB_OUTPUT + echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5.1-codex"]' >> $GITHUB_OUTPUT else # Convert comma-separated to JSON array models="${{ inputs.models }}" diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 9f68ca0a75..f74b271bf9 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -61,7 +61,7 @@ on: required: false type: string model_name: - description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)' + description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5.1-codex)' required: false type: string thinking_level: diff --git a/src/constants/knownModels.ts b/src/constants/knownModels.ts index 254a91b8ec..8d44198d9d 100644 --- a/src/constants/knownModels.ts +++ b/src/constants/knownModels.ts @@ -66,20 +66,30 @@ const MODEL_DEFINITIONS = { GPT_MINI: { provider: "openai", providerModelId: "gpt-5.1-codex-mini", + aliases: ["codex-mini"], }, } as const satisfies Record; export type KnownModelKey = keyof typeof MODEL_DEFINITIONS; +const MODEL_DEFINITION_ENTRIES = Object.entries(MODEL_DEFINITIONS) as Array< + [KnownModelKey, KnownModelDefinition] +>; export const KNOWN_MODELS = Object.fromEntries( - Object.entries(MODEL_DEFINITIONS).map(([key, definition]) => [ + MODEL_DEFINITION_ENTRIES.map(([key, definition]) => toKnownModelEntry(key, definition)) +); +function toKnownModelEntry( + key: K, + definition: KnownModelDefinition +): [K, KnownModel] { + return [ key, { ...definition, - id: `${definition.provider}:${definition.providerModelId}` as `${ModelProvider}:${string}`, + id: `${definition.provider}:${definition.providerModelId}`, }, - ]) -) as Record; + ]; +} export function getKnownModel(key: KnownModelKey): KnownModel { return KNOWN_MODELS[key]; diff --git a/tests/ipcMain/anthropic1MContext.test.ts b/tests/ipcMain/anthropic1MContext.test.ts index f3c0d6fcdb..21a57ff9f7 100644 --- a/tests/ipcMain/anthropic1MContext.test.ts +++ b/tests/ipcMain/anthropic1MContext.test.ts @@ -4,6 +4,7 @@ import { createEventCollector, assertStreamSuccess, buildLargeHistory, + modelString, } from "./helpers"; // Skip all tests if TEST_INTEGRATION is not set @@ -42,8 +43,7 @@ describeIntegration("IpcMain anthropic 1M context integration tests", () => { env.mockIpcRenderer, workspaceId, "Summarize the context above in one word.", - "anthropic", - "claude-sonnet-4-5", + modelString("anthropic", "claude-sonnet-4-5"), { providerOptions: { anthropic: { @@ -76,8 +76,7 @@ describeIntegration("IpcMain anthropic 1M context integration tests", () => { env.mockIpcRenderer, workspaceId, "Summarize the context above in one word.", - "anthropic", - "claude-sonnet-4-5", + modelString("anthropic", "claude-sonnet-4-5"), { providerOptions: { anthropic: { diff --git a/tests/ipcMain/forkWorkspace.test.ts b/tests/ipcMain/forkWorkspace.test.ts index 4ac28d3359..2d3948e2f7 100644 --- a/tests/ipcMain/forkWorkspace.test.ts +++ b/tests/ipcMain/forkWorkspace.test.ts @@ -13,6 +13,7 @@ import { createEventCollector, assertStreamSuccess, waitFor, + modelString, } from "./helpers"; import { detectDefaultTrunkBranch } from "../../src/git"; import { HistoryService } from "../../src/services/historyService"; @@ -100,8 +101,7 @@ describeIntegration("IpcMain fork workspace integration tests", () => { env.mockIpcRenderer, forkedWorkspaceId, "What is 2+2? Answer with just the number.", - "anthropic", - "claude-sonnet-4-5" + modelString("anthropic", "claude-sonnet-4-5") ); expect(sendResult.success).toBe(true); @@ -154,8 +154,7 @@ describeIntegration("IpcMain fork workspace integration tests", () => { env.mockIpcRenderer, forkedWorkspaceId, "What word did I ask you to remember? Reply with just the word.", - "anthropic", - "claude-sonnet-4-5" + modelString("anthropic", "claude-sonnet-4-5") ); expect(sendResult.success).toBe(true); @@ -206,15 +205,13 @@ describeIntegration("IpcMain fork workspace integration tests", () => { env.mockIpcRenderer, sourceWorkspaceId, "What is 5+5? Answer with just the number.", - "anthropic", - "claude-sonnet-4-5" + modelString("anthropic", "claude-sonnet-4-5") ), sendMessageWithModel( env.mockIpcRenderer, forkedWorkspaceId, "What is 3+3? Answer with just the number.", - "anthropic", - "claude-sonnet-4-5" + modelString("anthropic", "claude-sonnet-4-5") ), ]); @@ -253,8 +250,7 @@ describeIntegration("IpcMain fork workspace integration tests", () => { env.mockIpcRenderer, sourceWorkspaceId, "Count from 1 to 10, one number per line. Then say 'Done counting.'", - "anthropic", - "claude-sonnet-4-5" + modelString("anthropic", "claude-sonnet-4-5") ); // Wait for stream to start and produce some content @@ -286,8 +282,7 @@ describeIntegration("IpcMain fork workspace integration tests", () => { env.mockIpcRenderer, forkedWorkspaceId, "What is 7+3? Answer with just the number.", - "anthropic", - "claude-sonnet-4-5" + modelString("anthropic", "claude-sonnet-4-5") ); expect(forkedSendResult.success).toBe(true); diff --git a/tests/ipcMain/helpers.ts b/tests/ipcMain/helpers.ts index b7e9e2cac6..46596b428b 100644 --- a/tests/ipcMain/helpers.ts +++ b/tests/ipcMain/helpers.ts @@ -14,6 +14,7 @@ import * as os from "os"; import { detectDefaultTrunkBranch } from "../../src/git"; import type { TestEnvironment } from "./setup"; import type { RuntimeConfig } from "../../src/types/runtime"; +import { KNOWN_MODELS } from "../../src/constants/knownModels"; import type { ToolPolicy } from "../../src/utils/tools/toolPolicy"; // Test constants - centralized for consistency across all tests @@ -46,6 +47,13 @@ export function modelString(provider: string, model: string): string { /** * Send a message via IPC */ +type SendMessageWithModelOptions = Omit & { + imageParts?: Array<{ url: string; mediaType: string }>; +}; + +const DEFAULT_MODEL_ID = KNOWN_MODELS.SONNET.id; +const DEFAULT_PROVIDER = KNOWN_MODELS.SONNET.provider; + export async function sendMessage( mockIpcRenderer: IpcRenderer, workspaceId: string, @@ -61,19 +69,20 @@ export async function sendMessage( } /** - * Send a message with a provider and model (convenience wrapper) + * Send a message with an explicit model id (defaults to SONNET). */ export async function sendMessageWithModel( mockIpcRenderer: IpcRenderer, workspaceId: string, message: string, - provider = "anthropic", - model = "claude-sonnet-4-5", - options?: Omit + modelId: string = DEFAULT_MODEL_ID, + options?: SendMessageWithModelOptions ): Promise> { + const resolvedModel = modelId.includes(":") ? modelId : modelString(DEFAULT_PROVIDER, modelId); + return sendMessage(mockIpcRenderer, workspaceId, message, { ...options, - model: modelString(provider, model), + model: resolvedModel, }); } diff --git a/tests/ipcMain/modelNotFound.test.ts b/tests/ipcMain/modelNotFound.test.ts index 131a9c02aa..635c192b3f 100644 --- a/tests/ipcMain/modelNotFound.test.ts +++ b/tests/ipcMain/modelNotFound.test.ts @@ -1,5 +1,5 @@ import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup"; -import { sendMessageWithModel, createEventCollector, waitFor } from "./helpers"; +import { sendMessageWithModel, createEventCollector, waitFor, modelString } from "./helpers"; import { IPC_CHANNELS } from "../../src/constants/ipc-constants"; import type { Result } from "../../src/types/result"; import type { SendMessageError } from "../../src/types/errors"; @@ -30,8 +30,7 @@ describeIntegration("IpcMain model_not_found error handling", () => { env.mockIpcRenderer, workspaceId, "Hello", - "anthropic", - "invalid-model-that-does-not-exist-xyz123" + modelString("anthropic", "invalid-model-that-does-not-exist-xyz123") ); // Collect events to verify error classification @@ -69,8 +68,7 @@ describeIntegration("IpcMain model_not_found error handling", () => { env.mockIpcRenderer, workspaceId, "Hello", - "openai", - "gpt-nonexistent-model-xyz123" + modelString("openai", "gpt-nonexistent-model-xyz123") ); // Collect events to verify error classification diff --git a/tests/ipcMain/ollama.test.ts b/tests/ipcMain/ollama.test.ts index 920c3e6f11..9bbd139b9f 100644 --- a/tests/ipcMain/ollama.test.ts +++ b/tests/ipcMain/ollama.test.ts @@ -4,6 +4,7 @@ import { createEventCollector, assertStreamSuccess, extractTextFromEvents, + modelString, } from "./helpers"; import { spawn } from "child_process"; @@ -105,8 +106,7 @@ describeOllama("IpcMain Ollama integration tests", () => { env.mockIpcRenderer, workspaceId, "Say 'hello' and nothing else", - "ollama", - OLLAMA_MODEL + modelString("ollama", OLLAMA_MODEL) ); // Verify the IPC call succeeded @@ -139,8 +139,7 @@ describeOllama("IpcMain Ollama integration tests", () => { env.mockIpcRenderer, workspaceId, "What is the current date and time? Use the bash tool to find out.", - "ollama", - OLLAMA_MODEL + modelString("ollama", OLLAMA_MODEL) ); expect(result.success).toBe(true); @@ -178,8 +177,7 @@ describeOllama("IpcMain Ollama integration tests", () => { env.mockIpcRenderer, workspaceId, "Read the README.md file and tell me what the first heading says.", - "ollama", - OLLAMA_MODEL + modelString("ollama", OLLAMA_MODEL) ); expect(result.success).toBe(true); @@ -216,8 +214,7 @@ describeOllama("IpcMain Ollama integration tests", () => { env.mockIpcRenderer, workspaceId, "This should fail", - "ollama", - OLLAMA_MODEL, + modelString("ollama", OLLAMA_MODEL), { providerOptions: { ollama: {}, diff --git a/tests/ipcMain/openai-web-search.test.ts b/tests/ipcMain/openai-web-search.test.ts index 5fd5e41748..76587c2b92 100644 --- a/tests/ipcMain/openai-web-search.test.ts +++ b/tests/ipcMain/openai-web-search.test.ts @@ -1,5 +1,10 @@ import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup"; -import { sendMessageWithModel, createEventCollector, assertStreamSuccess } from "./helpers"; +import { + sendMessageWithModel, + createEventCollector, + assertStreamSuccess, + modelString, +} from "./helpers"; // Skip all tests if TEST_INTEGRATION is not set const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip; @@ -32,8 +37,7 @@ describeIntegration("OpenAI web_search integration tests", () => { workspaceId, "Use web search to find the current weather in San Francisco. " + "Then tell me if it's a good day for a picnic.", - "openai", - "gpt-5-codex", + modelString("openai", "gpt-5.1-codex-mini"), { thinkingLevel: "medium", // Ensure reasoning without excessive deliberation } diff --git a/tests/ipcMain/resumeStream.test.ts b/tests/ipcMain/resumeStream.test.ts index e71aeaacaf..713537b7a5 100644 --- a/tests/ipcMain/resumeStream.test.ts +++ b/tests/ipcMain/resumeStream.test.ts @@ -1,5 +1,5 @@ import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup"; -import { sendMessageWithModel, createEventCollector, waitFor } from "./helpers"; +import { sendMessageWithModel, createEventCollector, waitFor, modelString } from "./helpers"; import { IPC_CHANNELS } from "../../src/constants/ipc-constants"; import type { Result } from "../../src/types/result"; import type { SendMessageError } from "../../src/types/errors"; @@ -31,8 +31,7 @@ describeIntegration("IpcMain resumeStream integration tests", () => { env.mockIpcRenderer, workspaceId, `Run this bash command: for i in 1 2 3; do sleep 0.5; done && echo '${expectedWord}'`, - "anthropic", - "claude-sonnet-4-5" + modelString("anthropic", "claude-sonnet-4-5") ); // Wait for stream to start diff --git a/tests/ipcMain/sendMessage.test.ts b/tests/ipcMain/sendMessage.test.ts index 4eca7c5775..e389d131dd 100644 --- a/tests/ipcMain/sendMessage.test.ts +++ b/tests/ipcMain/sendMessage.test.ts @@ -34,7 +34,7 @@ import { KNOWN_MODELS } from "@/constants/knownModels"; // Test both providers with their respective models const PROVIDER_CONFIGS: Array<[string, string]> = [ - ["openai", KNOWN_MODELS.GPT_CODEX.providerModelId], + ["openai", KNOWN_MODELS.GPT_MINI.providerModelId], ["anthropic", KNOWN_MODELS.SONNET.providerModelId], ]; @@ -63,8 +63,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "Say 'hello' and nothing else", - provider, - model + modelString(provider, model) ); // Verify the IPC call succeeded @@ -95,7 +94,12 @@ describeIntegration("IpcMain sendMessage integration tests", () => { try { // Start a long-running stream with a bash command that takes time const longMessage = "Run this bash command: while true; do sleep 1; done"; - void sendMessageWithModel(env.mockIpcRenderer, workspaceId, longMessage, provider, model); + void sendMessageWithModel( + env.mockIpcRenderer, + workspaceId, + longMessage, + modelString(provider, model) + ); // Wait for stream to start const collector = createEventCollector(env.sentEvents, workspaceId); @@ -137,7 +141,12 @@ describeIntegration("IpcMain sendMessage integration tests", () => { // Ask the model to run a long-running bash command // Use explicit instruction to ensure tool call happens const message = "Use the bash tool to run: sleep 60"; - void sendMessageWithModel(env.mockIpcRenderer, workspaceId, message, provider, model); + void sendMessageWithModel( + env.mockIpcRenderer, + workspaceId, + message, + modelString(provider, model) + ); // Wait for stream to start (more reliable than waiting for tool-call-start) const collector = createEventCollector(env.sentEvents, workspaceId); @@ -195,8 +204,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "Write a short paragraph about TypeScript", - provider, - model, + modelString(provider, model), { thinkingLevel: "off" } ); @@ -267,7 +275,12 @@ describeIntegration("IpcMain sendMessage integration tests", () => { try { // Start a stream that will generate some tokens const message = "Write a haiku about coding"; - void sendMessageWithModel(env.mockIpcRenderer, workspaceId, message, provider, model); + void sendMessageWithModel( + env.mockIpcRenderer, + workspaceId, + message, + modelString(provider, model) + ); // Wait for stream to start and get some deltas const collector = createEventCollector(env.sentEvents, workspaceId); @@ -331,8 +344,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "Run this bash command: while true; do sleep 0.1; done", - provider, - model + modelString(provider, model) ); // Wait for tool-call-start (which means model is executing bash) @@ -426,8 +438,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "", - provider, - model + modelString(provider, model) ); // Should fail - empty messages not allowed @@ -464,8 +475,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "Say 'first message' and nothing else", - provider, - model + modelString(provider, model) ); expect(result1.success).toBe(true); @@ -485,8 +495,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "Say 'edited message' and nothing else", - provider, - model, + modelString(provider, model), { editMessageId: (firstUserMessage as { id: string }).id } ); expect(result2.success).toBe(true); @@ -512,8 +521,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "Run this bash command: for i in {1..20}; do sleep 0.5; done && echo done", - provider, - model + modelString(provider, model) ); expect(result1.success).toBe(true); @@ -531,8 +539,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "Run this bash command: for i in {1..10}; do sleep 0.5; done && echo second", - provider, - model, + modelString(provider, model), { editMessageId: (firstUserMessage as { id: string }).id } ); expect(result2.success).toBe(true); @@ -552,8 +559,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "Say 'third edit' and nothing else", - provider, - model, + modelString(provider, model), { editMessageId: (secondUserMessage as { id: string }).id } ); expect(result3.success).toBe(true); @@ -597,8 +603,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "Read the file test-file.txt and tell me its contents verbatim. Do not add any extra text.", - provider, - model + modelString(provider, model) ); expect(result.success).toBe(true); @@ -635,8 +640,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "Generate a random uncommon word and only say that word, nothing else.", - provider, - model + modelString(provider, model) ); expect(result1.success).toBe(true); @@ -670,8 +674,7 @@ describeIntegration("IpcMain sendMessage integration tests", () => { env.mockIpcRenderer, workspaceId, "What was the word you just said? Reply with only that word.", - provider, - model + modelString(provider, model) ); expect(result2.success).toBe(true); @@ -776,8 +779,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, "Please respond.", - provider, - model, + modelString(provider, model), { mode: "plan" } ); expect(resultPlan.success).toBe(true); @@ -800,8 +802,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, "Please respond.", - provider, - model, + modelString(provider, model), { mode: "exec" } ); expect(resultExec.success).toBe(true); @@ -849,8 +850,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, "Say 'parity test' and nothing else", - provider, - model + modelString(provider, model) ); // Collect response @@ -891,8 +891,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, "Hello", - provider, - model + modelString(provider, model) ); // Should fail with api_key_not_found error @@ -920,8 +919,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, "Hello, world!", - provider, - nonExistentModel + modelString(provider, nonExistentModel) ); // IPC call should succeed (errors come through stream events) @@ -988,8 +986,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, "What is the weather?", - provider, - model, + modelString(provider, model), sendOptions ); @@ -1108,8 +1105,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, "Delete the file bash-test-file.txt using bash rm command", - provider, - model, + modelString(provider, model), { toolPolicy: [{ regex_match: "bash", action: "disable" }], ...(provider === "openai" @@ -1181,8 +1177,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, "Edit the file edit-test-file.txt and replace 'original' with 'modified'", - provider, - model, + modelString(provider, model), { toolPolicy: [ { regex_match: "file_edit_.*", action: "disable" }, @@ -1301,8 +1296,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, "This should trigger a context error", - provider, - model, + modelString(provider, model), { providerOptions: { openai: { @@ -1349,8 +1343,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, "This should succeed with auto-truncation", - provider, - model + modelString(provider, model) // disableAutoTruncation defaults to false (auto-truncation enabled) ); @@ -1381,8 +1374,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, `Open and replace 'line2' with 'LINE2' in ${path.basename(testFilePath)} using file_edit_replace, then confirm the change was successfully applied.`, - provider, - model + modelString(provider, model) ); expect(result1.success).toBe(true); @@ -1425,8 +1417,7 @@ These are general instructions that apply to all modes. env.mockIpcRenderer, workspaceId, "Confirm the previous edit was applied.", - provider, - model + modelString(provider, model) ); expect(result2.success).toBe(true); @@ -1589,7 +1580,7 @@ describe.each(PROVIDER_CONFIGS)("%s:%s image support", (provider, model) => { 40000 ); - // Test multi-turn conversation specifically for reasoning models (codex) + // Test multi-turn conversation specifically for reasoning models (codex mini) test.concurrent( "should handle multi-turn conversation with response ID persistence (openai reasoning models)", async () => { @@ -1600,8 +1591,7 @@ describe.each(PROVIDER_CONFIGS)("%s:%s image support", (provider, model) => { env.mockIpcRenderer, workspaceId, "What is 2+2?", - "openai", - KNOWN_MODELS.GPT_CODEX.providerModelId + modelString("openai", KNOWN_MODELS.GPT_MINI.providerModelId) ); expect(result1.success).toBe(true); @@ -1615,8 +1605,7 @@ describe.each(PROVIDER_CONFIGS)("%s:%s image support", (provider, model) => { env.mockIpcRenderer, workspaceId, "Now add 3 to that", - "openai", - KNOWN_MODELS.GPT_CODEX.providerModelId + modelString("openai", KNOWN_MODELS.GPT_MINI.providerModelId) ); expect(result2.success).toBe(true); diff --git a/tests/ipcMain/streamErrorRecovery.test.ts b/tests/ipcMain/streamErrorRecovery.test.ts index 658704ff51..ea7d193a17 100644 --- a/tests/ipcMain/streamErrorRecovery.test.ts +++ b/tests/ipcMain/streamErrorRecovery.test.ts @@ -17,7 +17,12 @@ */ import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup"; -import { sendMessageWithModel, createEventCollector, readChatHistory } from "./helpers"; +import { + sendMessageWithModel, + createEventCollector, + readChatHistory, + modelString, +} from "./helpers"; import { IPC_CHANNELS } from "../../src/constants/ipc-constants"; // Skip all tests if TEST_INTEGRATION is not set @@ -246,8 +251,7 @@ IMPORTANT: Do not add any other text. Start immediately with ${nonce}-1: one. If env.mockIpcRenderer, workspaceId, prompt, - PROVIDER, - MODEL, + modelString(PROVIDER, MODEL), { toolPolicy: [{ regex_match: ".*", action: "disable" }] } ); expect(sendResult.success).toBe(true);