diff --git a/src/common/types/thinking.ts b/src/common/types/thinking.ts
index 8d71620027..66cd536a6f 100644
--- a/src/common/types/thinking.ts
+++ b/src/common/types/thinking.ts
@@ -14,19 +14,23 @@ export type ThinkingLevel = "off" | "low" | "medium" | "high";
 export type ThinkingLevelOn = Exclude<ThinkingLevel, "off">;
 
 /**
- * Anthropic thinking token budget mapping
+ * Anthropic effort level mapping
  *
- * These heuristics balance thinking depth with response time and cost:
- * - off: No extended thinking
- * - low: Quick thinking for straightforward tasks (4K tokens)
- * - medium: Standard thinking for moderate complexity (10K tokens)
- * - high: Deep thinking for complex problems (20K tokens)
+ * Maps our unified thinking levels to Anthropic's effort parameter:
+ * - off: No effort specified (undefined)
+ * - low: Most efficient - significant token savings
+ * - medium: Balanced approach with moderate token savings
+ * - high: Maximum capability (default behavior)
+ *
+ * The effort parameter controls all token spend, including thinking,
+ * text responses, and tool calls. Unlike budget_tokens, it doesn't require
+ * thinking to be explicitly enabled.
  */
-export const ANTHROPIC_THINKING_BUDGETS: Record<ThinkingLevel, number> = {
-  off: 0,
-  low: 4000,
-  medium: 10000,
-  high: 20000,
+export const ANTHROPIC_EFFORT: Record<ThinkingLevel, "low" | "medium" | "high" | undefined> = {
+  off: undefined,
+  low: "low",
+  medium: "medium",
+  high: "high",
 };
 
 /**
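To make the new mapping concrete, here is a minimal sketch of the lookup. The effortFor helper is hypothetical (it does not appear in this diff); the import path and the Record value type are taken from the file above, where the value type is reconstructed from the object literal:

import { ANTHROPIC_EFFORT, type ThinkingLevel } from "@/common/types/thinking";

// Hypothetical helper, only to show the lookup: "off" maps to undefined,
// which downstream code relies on to omit the effort key entirely.
function effortFor(level: ThinkingLevel): "low" | "medium" | "high" | undefined {
  return ANTHROPIC_EFFORT[level];
}

effortFor("off");    // undefined -> no effort parameter sent
effortFor("medium"); // "medium"

Because "off" maps to undefined rather than 0, call sites can distinguish "send no effort parameter" from "send a minimal effort" with a plain truthiness check.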
diff --git a/src/common/utils/ai/providerOptions.ts b/src/common/utils/ai/providerOptions.ts
index d343c49499..93b33cb068 100644
--- a/src/common/utils/ai/providerOptions.ts
+++ b/src/common/utils/ai/providerOptions.ts
@@ -11,7 +11,7 @@ import type { XaiProviderOptions } from "@ai-sdk/xai";
 import type { MuxProviderOptions } from "@/common/types/providerOptions";
 import type { ThinkingLevel } from "@/common/types/thinking";
 import {
-  ANTHROPIC_THINKING_BUDGETS,
+  ANTHROPIC_EFFORT,
   GEMINI_THINKING_BUDGETS,
   OPENAI_REASONING_EFFORT,
   OPENROUTER_REASONING_EFFORT,
@@ -19,7 +19,6 @@ import {
 import { log } from "@/node/services/log";
 import type { MuxMessage } from "@/common/types/message";
 import { enforceThinkingPolicy } from "@/browser/utils/thinking/policy";
-import { getModelStats } from "@/common/utils/tokens/modelStats";
 /**
  * OpenRouter reasoning options
  */
@@ -84,9 +83,9 @@
 
   // Build Anthropic-specific options
   if (provider === "anthropic") {
-    const budgetTokens = ANTHROPIC_THINKING_BUDGETS[effectiveThinking];
+    const effort = ANTHROPIC_EFFORT[effectiveThinking];
     log.debug("buildProviderOptions: Anthropic config", {
-      budgetTokens,
+      effort,
       thinkingLevel: effectiveThinking,
     });
 
@@ -94,13 +93,9 @@
       anthropic: {
         disableParallelToolUse: false, // Always enable concurrent tool execution
         sendReasoning: true, // Include reasoning traces in requests sent to the model
-        // Conditionally add thinking configuration
-        ...(budgetTokens > 0 && {
-          thinking: {
-            type: "enabled",
-            budgetTokens,
-          },
-        }),
+        // Use the effort parameter to control token spend (thinking, text, and tool calls)
+        // The SDK auto-adds the beta header "effort-2025-11-24" when effort is set
+        ...(effort && { effort }),
       },
     };
     log.debug("buildProviderOptions: Returning Anthropic options", options);
@@ -278,77 +273,3 @@
   log.debug("buildProviderOptions: Unsupported provider", provider);
   return {};
 }
-
-/**
- * Calculate the effective maxOutputTokens for a model based on its limits and thinking budget
- *
- * For Anthropic models with extended thinking, the AI SDK adds thinkingBudget to maxOutputTokens
- * internally. We need to ensure the sum doesn't exceed the model's max_output_tokens limit.
- *
- * For example, Claude Opus 4 has max_output_tokens=32000. If we use:
- * - thinkingBudget=20000 (high)
- * - maxOutputTokens=32000
- * Then total=52000 which exceeds 32000 → SDK shows warning and caps output
- *
- * Solution: Reduce maxOutputTokens so that maxOutputTokens + thinkingBudget <= model limit
- *
- * @param modelString - Full model string (e.g., "anthropic:claude-opus-4-1")
- * @param thinkingLevel - Current thinking level
- * @param requestedMaxOutputTokens - Optional user-requested maxOutputTokens
- * @returns Effective maxOutputTokens that respects model limits with thinking budget
- */
-export function calculateEffectiveMaxOutputTokens(
-  modelString: string,
-  thinkingLevel: ThinkingLevel,
-  requestedMaxOutputTokens?: number
-): number | undefined {
-  const [provider] = modelString.split(":");
-
-  // Only apply this adjustment for Anthropic models
-  if (provider !== "anthropic") {
-    return requestedMaxOutputTokens;
-  }
-
-  // Get the actual thinking level after policy enforcement
-  const effectiveThinking = enforceThinkingPolicy(modelString, thinkingLevel);
-  const thinkingBudget = ANTHROPIC_THINKING_BUDGETS[effectiveThinking];
-
-  // Get model's max output tokens from models.json
-  const modelStats = getModelStats(modelString);
-  const modelMaxOutput = modelStats?.max_output_tokens;
-
-  // If we don't know the model's max output, return requested value
-  if (!modelMaxOutput) {
-    log.debug("calculateEffectiveMaxOutputTokens: Unknown model max output, using requested", {
-      modelString,
-      requestedMaxOutputTokens,
-    });
-    return requestedMaxOutputTokens;
-  }
-
-  // Calculate the maximum safe maxOutputTokens
-  // The SDK will add thinkingBudget to maxOutputTokens, so we need room for both
-  const maxSafeOutput = modelMaxOutput - thinkingBudget;
-
-  // If user didn't request specific tokens, use the max safe value
-  const targetOutput = requestedMaxOutputTokens ?? modelMaxOutput;
-
-  // Cap at the safe maximum
-  const effectiveOutput = Math.min(targetOutput, maxSafeOutput);
-
-  // Ensure we don't go below a reasonable minimum (1000 tokens)
-  const finalOutput = Math.max(effectiveOutput, 1000);
-
-  log.debug("calculateEffectiveMaxOutputTokens", {
-    modelString,
-    thinkingLevel,
-    effectiveThinking,
-    thinkingBudget,
-    modelMaxOutput,
-    requestedMaxOutputTokens,
-    maxSafeOutput,
-    finalOutput,
-  });
-
-  return finalOutput;
-}
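For context on the calculateEffectiveMaxOutputTokens removal above: with budget_tokens, the AI SDK added the thinking budget on top of maxOutputTokens, so the helper had to clamp the requested output to fit under the model's cap. A worked sketch of that arithmetic, using the Claude Opus 4 numbers from the removed doc comment:

// Worked example of the removed clamping logic (numbers from the old doc comment).
const modelMaxOutput = 32000; // Claude Opus 4 max_output_tokens
const thinkingBudget = 20000; // old "high" budget
const requested = 32000;

const maxSafeOutput = modelMaxOutput - thinkingBudget;      // 12000
const effectiveOutput = Math.min(requested, maxSafeOutput); // 12000
const finalOutput = Math.max(effectiveOutput, 1000);        // 1000-token floor, still 12000

Because effort is a single knob on total spend rather than an additive per-request token budget, none of this accounting applies anymore and the helper can be deleted outright.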
diff --git a/src/node/services/aiService.ts b/src/node/services/aiService.ts
index 81222beb1e..6cb748f913 100644
--- a/src/node/services/aiService.ts
+++ b/src/node/services/aiService.ts
@@ -33,10 +33,7 @@ import type { HistoryService } from "./historyService";
 import type { PartialService } from "./partialService";
 import { buildSystemMessage, readToolInstructions } from "./systemMessage";
 import { getTokenizerForModel } from "@/node/utils/main/tokenizer";
-import {
-  buildProviderOptions,
-  calculateEffectiveMaxOutputTokens,
-} from "@/common/utils/ai/providerOptions";
+import { buildProviderOptions } from "@/common/utils/ai/providerOptions";
 import type { ThinkingLevel } from "@/common/types/thinking";
 import type {
   StreamAbortEvent,
@@ -929,15 +926,6 @@
       effectiveMuxProviderOptions
     );
 
-    // Calculate effective maxOutputTokens that accounts for thinking budget
-    // For Anthropic models with extended thinking, the SDK adds thinkingBudget to maxOutputTokens
-    // so we need to ensure the sum doesn't exceed the model's max_output_tokens limit
-    const effectiveMaxOutputTokens = calculateEffectiveMaxOutputTokens(
-      effectiveModelString,
-      thinkingLevel ?? "off",
-      maxOutputTokens
-    );
-
     // Delegate to StreamManager with model instance, system message, tools, historySequence, and initial metadata
     const streamResult = await this.streamManager.startStream(
       workspaceId,
@@ -955,7 +943,7 @@
         mode, // Pass mode so it persists in final history entry
       },
       providerOptions,
-      effectiveMaxOutputTokens,
+      maxOutputTokens,
       toolPolicy,
       streamToken // Pass the pre-generated stream token
     );
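Putting the pieces together, a self-contained sketch of the options the Anthropic branch now emits. The names mirror the diff, while the surrounding buildProviderOptions plumbing (thinking-policy enforcement, logging, other providers) is elided:

type ThinkingLevel = "off" | "low" | "medium" | "high";

const ANTHROPIC_EFFORT: Record<ThinkingLevel, "low" | "medium" | "high" | undefined> = {
  off: undefined,
  low: "low",
  medium: "medium",
  high: "high",
};

function anthropicOptions(level: ThinkingLevel) {
  const effort = ANTHROPIC_EFFORT[level];
  return {
    anthropic: {
      disableParallelToolUse: false,
      sendReasoning: true,
      // When effort is undefined ("off"), the spread contributes nothing,
      // so the request carries no effort parameter at all.
      ...(effort && { effort }),
    },
  };
}

console.log(anthropicOptions("off"));
// { anthropic: { disableParallelToolUse: false, sendReasoning: true } }
console.log(anthropicOptions("high"));
// { anthropic: { disableParallelToolUse: false, sendReasoning: true, effort: "high" } }

This is also why aiService.ts can now forward maxOutputTokens to startStream unchanged: with no thinking budget stacked on top of the output limit, there is nothing left to clamp.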