diff --git a/packages/types/src/provider-settings.ts b/packages/types/src/provider-settings.ts
index 3b3717ad45..cc74f9f43d 100644
--- a/packages/types/src/provider-settings.ts
+++ b/packages/types/src/provider-settings.ts
@@ -250,6 +250,7 @@ const openAiSchema = baseProviderSettingsSchema.extend({
openAiStreamingEnabled: z.boolean().optional(),
openAiHostHeader: z.string().optional(), // Keep temporarily for backward compatibility during migration.
openAiHeaders: z.record(z.string(), z.string()).optional(),
+ openAiStrictToolMessageOrdering: z.boolean().optional(), // For providers that don't allow user messages after tool messages.
})
const ollamaSchema = baseProviderSettingsSchema.extend({
diff --git a/src/api/providers/openai.ts b/src/api/providers/openai.ts
index 43a4fe7ae3..b6b5147739 100644
--- a/src/api/providers/openai.ts
+++ b/src/api/providers/openai.ts
@@ -102,11 +102,18 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
content: systemPrompt,
}
+ // When strict tool message ordering is enabled, merge text content after tool_results
+ // into the last tool message instead of creating a separate user message.
+ // This is required for providers like NVIDIA NIM that don't allow user messages after tool messages.
+ const strictToolMessageOrdering = this.options.openAiStrictToolMessageOrdering ?? false
+
if (this.options.openAiStreamingEnabled ?? true) {
let convertedMessages
if (deepseekReasoner) {
- convertedMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
+ convertedMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages], {
+ mergeToolResultText: strictToolMessageOrdering,
+ })
} else {
if (modelInfo.supportsPromptCache) {
systemMessage = {
@@ -122,7 +129,10 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
}
}
- convertedMessages = [systemMessage, ...convertToOpenAiMessages(messages)]
+ convertedMessages = [
+ systemMessage,
+ ...convertToOpenAiMessages(messages, { mergeToolResultText: strictToolMessageOrdering }),
+ ]
if (modelInfo.supportsPromptCache) {
// Note: the following logic is copied from openrouter:
@@ -227,8 +237,13 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
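+ // Non-streaming path: apply the same tool result text merging when strict ordering is enabled.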
const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
model: modelId,
messages: deepseekReasoner
- ? convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
- : [systemMessage, ...convertToOpenAiMessages(messages)],
+ ? convertToR1Format([{ role: "user", content: systemPrompt }, ...messages], {
+ mergeToolResultText: strictToolMessageOrdering,
+ })
+ : [
+ systemMessage,
+ ...convertToOpenAiMessages(messages, { mergeToolResultText: strictToolMessageOrdering }),
+ ],
...(metadata?.tools && { tools: this.convertToolsForOpenAI(metadata.tools) }),
...(metadata?.tool_choice && { tool_choice: metadata.tool_choice }),
...(metadata?.toolProtocol === "native" && {
@@ -338,6 +353,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
): ApiStream {
const modelInfo = this.getModel().info
const methodIsAzureAiInference = this._isAzureAiInference(this.options.openAiBaseUrl)
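+ // Honor strict tool message ordering in this request path as well.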
+ const strictToolMessageOrdering = this.options.openAiStrictToolMessageOrdering ?? false
if (this.options.openAiStreamingEnabled ?? true) {
const isGrokXAI = this._isGrokXAI(this.options.openAiBaseUrl)
@@ -349,7 +365,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
role: "developer",
content: `Formatting re-enabled\n${systemPrompt}`,
},
- ...convertToOpenAiMessages(messages),
+ ...convertToOpenAiMessages(messages, { mergeToolResultText: strictToolMessageOrdering }),
],
stream: true,
...(isGrokXAI ? {} : { stream_options: { include_usage: true } }),
@@ -386,7 +402,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
role: "developer",
content: `Formatting re-enabled\n${systemPrompt}`,
},
- ...convertToOpenAiMessages(messages),
+ ...convertToOpenAiMessages(messages, { mergeToolResultText: strictToolMessageOrdering }),
],
reasoning_effort: modelInfo.reasoningEffort as "low" | "medium" | "high" | undefined,
temperature: undefined,
diff --git a/src/api/transform/__tests__/openai-format.spec.ts b/src/api/transform/__tests__/openai-format.spec.ts
index d5d4404837..33996763ef 100644
--- a/src/api/transform/__tests__/openai-format.spec.ts
+++ b/src/api/transform/__tests__/openai-format.spec.ts
@@ -319,7 +319,7 @@ describe("convertToOpenAiMessages", () => {
)
})
- it("should NOT merge text when images are present (fall back to user message)", () => {
+ it("should merge text and send images separately when mergeToolResultText is true", () => {
const anthropicMessages: Anthropic.Messages.MessageParam[] = [
{
role: "user",
@@ -329,6 +329,10 @@ describe("convertToOpenAiMessages", () => {
tool_use_id: "tool-123",
content: "Tool result content",
},
+ {
+ type: "text",
+ text: "Context info",
+ },
{
type: "image",
source: {
@@ -343,9 +347,57 @@ describe("convertToOpenAiMessages", () => {
const openAiMessages = convertToOpenAiMessages(anthropicMessages, { mergeToolResultText: true })
- // Should produce a tool message AND a user message (because image is present)
+ // Should produce a tool message with merged text AND a user message for the image
+ expect(openAiMessages).toHaveLength(2)
+
+ // First message: tool message with merged text
+ const toolMessage = openAiMessages[0] as OpenAI.Chat.ChatCompletionToolMessageParam
+ expect(toolMessage.role).toBe("tool")
+ expect(toolMessage.tool_call_id).toBe("tool-123")
+ expect(toolMessage.content).toBe("Tool result content\n\nContext info")
+
+ // Second message: user message with only the image
+ expect(openAiMessages[1].role).toBe("user")
+ const userContent = openAiMessages[1].content as Array<{ type: string; image_url?: { url: string } }>
+ expect(Array.isArray(userContent)).toBe(true)
+ expect(userContent).toHaveLength(1)
+ expect(userContent[0].type).toBe("image_url")
+ expect(userContent[0].image_url?.url).toBe("data:image/png;base64,base64data")
+ })
+
+ it("should send only images as user message when no text content exists with mergeToolResultText", () => {
+ const anthropicMessages: Anthropic.Messages.MessageParam[] = [
+ {
+ role: "user",
+ content: [
+ {
+ type: "tool_result",
+ tool_use_id: "tool-123",
+ content: "Tool result content",
+ },
+ {
+ type: "image",
+ source: {
+ type: "base64",
+ media_type: "image/png",
+ data: "base64data",
+ },
+ },
+ ],
+ },
+ ]
+
+ const openAiMessages = convertToOpenAiMessages(anthropicMessages, { mergeToolResultText: true })
+
+ // Should produce a tool message AND a user message (only image, no text to merge)
expect(openAiMessages).toHaveLength(2)
expect((openAiMessages[0] as OpenAI.Chat.ChatCompletionToolMessageParam).role).toBe("tool")
+ // Tool message content should NOT be modified since there's no text to merge
+ expect((openAiMessages[0] as OpenAI.Chat.ChatCompletionToolMessageParam).content).toBe(
+ "Tool result content",
+ )
expect(openAiMessages[1].role).toBe("user")
})
@@ -430,6 +482,122 @@ describe("convertToOpenAiMessages", () => {
expect(openAiMessages).toHaveLength(1)
expect(openAiMessages[0].role).toBe("user")
})
+
+ it("should merge text into tool messages for multiple tool calls across conversation turns", () => {
+ // This test simulates a full conversation with multiple tool_result + environment_details messages
+ // to ensure mergeToolResultText works correctly for ALL tool_result messages, not just the first one
+ // Regression test for: "The fix works for the first message but after the first response the text content is NOT merged"
+ const anthropicMessages: Anthropic.Messages.MessageParam[] = [
+ // Initial user message (no tool_results)
+ {
+ role: "user",
+ content: [
+ { type: "text", text: "Create a file for me" },
+ { type: "text", text: "Context 1" },
+ ],
+ },
+ // Assistant uses first tool
+ {
+ role: "assistant",
+ content: [
+ { type: "text", text: "I'll create the file for you." },
+ {
+ type: "tool_use",
+ id: "call_1",
+ name: "write_file",
+ input: { path: "test.txt", content: "hello" },
+ },
+ ],
+ },
+ // First tool result + environment_details
+ {
+ role: "user",
+ content: [
+ { type: "tool_result", tool_use_id: "call_1", content: "File created successfully" },
+ { type: "text", text: "Context 2" },
+ ],
+ },
+ // Assistant uses second tool
+ {
+ role: "assistant",
+ content: [
+ { type: "text", text: "Now I'll read the file to verify." },
+ { type: "tool_use", id: "call_2", name: "read_file", input: { path: "test.txt" } },
+ ],
+ },
+ // Second tool result + environment_details (this is where the bug was reported)
+ {
+ role: "user",
+ content: [
+ { type: "tool_result", tool_use_id: "call_2", content: "File content: hello" },
+ { type: "text", text: "Context 3" },
+ ],
+ },
+ ]
+
+ const openAiMessages = convertToOpenAiMessages(anthropicMessages, { mergeToolResultText: true })
+
+ // Expected structure:
+ // 1. User message (initial, no tool_results - text should remain as user message)
+ // 2. Assistant message with tool_calls
+ // 3. Tool message with merged text (first tool_result)
+ // 4. Assistant message with tool_calls
+ // 5. Tool message with merged text (second tool_result)
+ expect(openAiMessages).toHaveLength(5)
+
+ // First message should be a user message (no tool_results to merge into)
+ expect(openAiMessages[0].role).toBe("user")
+
+ // Second message should be assistant with tool_calls
+ expect(openAiMessages[1].role).toBe("assistant")
+ expect((openAiMessages[1] as OpenAI.Chat.ChatCompletionAssistantMessageParam).tool_calls).toHaveLength(1)
+
+ // Third message should be tool message with merged environment_details
+ const firstToolMsg = openAiMessages[2] as OpenAI.Chat.ChatCompletionToolMessageParam
+ expect(firstToolMsg.role).toBe("tool")
+ expect(firstToolMsg.tool_call_id).toBe("call_1")
+ expect(firstToolMsg.content).toContain("File created successfully")
+ expect(firstToolMsg.content).toContain("Context 2")
+
+ // Fourth message should be assistant with tool_calls
+ expect(openAiMessages[3].role).toBe("assistant")
+ expect((openAiMessages[3] as OpenAI.Chat.ChatCompletionAssistantMessageParam).tool_calls).toHaveLength(1)
+
+ // Fifth message should be tool message with merged environment_details (THE BUG FIX)
+ const secondToolMsg = openAiMessages[4] as OpenAI.Chat.ChatCompletionToolMessageParam
+ expect(secondToolMsg.role).toBe("tool")
+ expect(secondToolMsg.tool_call_id).toBe("call_2")
+ expect(secondToolMsg.content).toContain("File content: hello")
+ expect(secondToolMsg.content).toContain("Context 3")
+ })
+
+ it("should NOT create user messages after tool messages when mergeToolResultText is true", () => {
+ // This test specifically verifies that the "user after tool" error is avoided
+ const anthropicMessages: Anthropic.Messages.MessageParam[] = [
+ {
+ role: "assistant",
+ content: [{ type: "tool_use", id: "tool_1", name: "read_file", input: { path: "test.ts" } }],
+ },
+ {
+ role: "user",
+ content: [
+ { type: "tool_result", tool_use_id: "tool_1", content: "File contents" },
+ { type: "text", text: "Some context" },
+ ],
+ },
+ ]
+
+ const openAiMessages = convertToOpenAiMessages(anthropicMessages, { mergeToolResultText: true })
+
+ // Should produce assistant + tool (no user message)
+ expect(openAiMessages).toHaveLength(2)
+ expect(openAiMessages[0].role).toBe("assistant")
+ expect(openAiMessages[1].role).toBe("tool")
+ // The text should be merged into the tool message, NOT as a separate user message
+ expect((openAiMessages[1] as OpenAI.Chat.ChatCompletionToolMessageParam).content).toContain(
+ "Some context",
+ )
+ })
})
describe("reasoning_details transformation", () => {
diff --git a/src/api/transform/openai-format.ts b/src/api/transform/openai-format.ts
index ad02be5541..f182a2aac1 100644
--- a/src/api/transform/openai-format.ts
+++ b/src/api/transform/openai-format.ts
@@ -138,24 +138,40 @@ export function convertToOpenAiMessages(
// Process non-tool messages
if (nonToolMessages.length > 0) {
- // Check if we should merge text into the last tool message
- // This is critical for reasoning/thinking models where a user message
- // after tool results causes the model to drop all previous reasoning_content
- const hasOnlyTextContent = nonToolMessages.every((part) => part.type === "text")
const hasToolMessages = toolMessages.length > 0
- const shouldMergeIntoToolMessage =
- options?.mergeToolResultText && hasToolMessages && hasOnlyTextContent
- if (shouldMergeIntoToolMessage) {
+ if (options?.mergeToolResultText && hasToolMessages) {
+ // When mergeToolResultText is enabled, split the remaining content:
+ // merge text into the last tool message and send any images as a separate user message.
+ // This is critical for providers like NVIDIA NIM that reject user messages after tool messages.
+ const textMessages = nonToolMessages.filter(
+ (part) => part.type === "text",
+ ) as Anthropic.TextBlockParam[]
+ const imageMessages = nonToolMessages.filter(
+ (part) => part.type === "image",
+ ) as Anthropic.ImageBlockParam[]
+
// Merge text content into the last tool message
- const lastToolMessage = openAiMessages[
- openAiMessages.length - 1
- ] as OpenAI.Chat.ChatCompletionToolMessageParam
- if (lastToolMessage?.role === "tool") {
- const additionalText = nonToolMessages
- .map((part) => (part as Anthropic.TextBlockParam).text)
- .join("\n")
- lastToolMessage.content = `${lastToolMessage.content}\n\n${additionalText}`
+ if (textMessages.length > 0) {
+ const lastToolMessage = openAiMessages[
+ openAiMessages.length - 1
+ ] as OpenAI.Chat.ChatCompletionToolMessageParam
+ if (lastToolMessage?.role === "tool") {
+ const additionalText = textMessages.map((part) => part.text).join("\n")
+ lastToolMessage.content = `${lastToolMessage.content}\n\n${additionalText}`
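+ // e.g. the tool content becomes "File created successfully\n\nContext info"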
+ }
+ }
+
+ // Send images as a separate user message if any
+ // Note: Images must still be sent as user messages since tool messages don't support images
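+ // Caveat: when images are present this still yields a user message after the tool message, which strict providers may reject.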
+ if (imageMessages.length > 0) {
+ openAiMessages.push({
+ role: "user",
+ content: imageMessages.map((part) => ({
+ type: "image_url",
+ image_url: { url: `data:${part.source.media_type};base64,${part.source.data}` },
+ })),
+ })
}
} else {
// Standard behavior: add user message with text/image content
diff --git a/webview-ui/src/components/settings/providers/OpenAICompatible.tsx b/webview-ui/src/components/settings/providers/OpenAICompatible.tsx
index 4eea6f09f1..cc1777ca76 100644
--- a/webview-ui/src/components/settings/providers/OpenAICompatible.tsx
+++ b/webview-ui/src/components/settings/providers/OpenAICompatible.tsx
@@ -158,6 +158,16 @@ export const OpenAICompatible = ({
onChange={handleInputChange("openAiStreamingEnabled", noTransform)}>
{t("settings:modelInfo.enableStreaming")}
 </Checkbox>
+
+ <Checkbox
+ checked={apiConfiguration?.openAiStrictToolMessageOrdering ?? false}
+ onChange={handleInputChange("openAiStrictToolMessageOrdering", noTransform)}>
+ {t("settings:providers.openAiStrictToolMessageOrdering.label")}
+ </Checkbox>
+ <div className="text-sm text-vscode-descriptionForeground">
+ {t("settings:providers.openAiStrictToolMessageOrdering.description")}
+ </div>