Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 15 additions & 11 deletions src/common/types/thinking.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,23 @@ export type ThinkingLevel = "off" | "low" | "medium" | "high";
/** Thinking levels that actually enable extended thinking — every level except "off". */
export type ThinkingLevelOn = Exclude<ThinkingLevel, "off">;

/**
* Anthropic thinking token budget mapping
* Anthropic effort level mapping
*
* These heuristics balance thinking depth with response time and cost:
* - off: No extended thinking
* - low: Quick thinking for straightforward tasks (4K tokens)
* - medium: Standard thinking for moderate complexity (10K tokens)
* - high: Deep thinking for complex problems (20K tokens)
* Maps our unified thinking levels to Anthropic's effort parameter:
* - off: No effort specified (undefined)
* - low: Most efficient - significant token savings
* - medium: Balanced approach with moderate token savings
* - high: Maximum capability (default behavior)
*
* The effort parameter controls all token spend including thinking,
* text responses, and tool calls. Unlike budget_tokens, it doesn't require
* thinking to be explicitly enabled.
*/
export const ANTHROPIC_THINKING_BUDGETS: Record<ThinkingLevel, number> = {
off: 0,
low: 4000,
medium: 10000,
high: 20000,
// Maps each unified ThinkingLevel to the value sent as Anthropic's `effort` parameter.
// "off" maps to undefined so callers can conditionally omit the field entirely
// (see the `...(effort && { effort })` spread in buildProviderOptions).
export const ANTHROPIC_EFFORT: Record<ThinkingLevel, "low" | "medium" | "high" | undefined> = {
off: undefined, // no effort key emitted — provider default behavior
low: "low",
medium: "medium",
high: "high",
};

/**
Expand Down
91 changes: 6 additions & 85 deletions src/common/utils/ai/providerOptions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,14 @@ import type { XaiProviderOptions } from "@ai-sdk/xai";
import type { MuxProviderOptions } from "@/common/types/providerOptions";
import type { ThinkingLevel } from "@/common/types/thinking";
import {
ANTHROPIC_THINKING_BUDGETS,
ANTHROPIC_EFFORT,
GEMINI_THINKING_BUDGETS,
OPENAI_REASONING_EFFORT,
OPENROUTER_REASONING_EFFORT,
} from "@/common/types/thinking";
import { log } from "@/node/services/log";
import type { MuxMessage } from "@/common/types/message";
import { enforceThinkingPolicy } from "@/browser/utils/thinking/policy";
import { getModelStats } from "@/common/utils/tokens/modelStats";

/**
* OpenRouter reasoning options
Expand Down Expand Up @@ -84,23 +83,19 @@ export function buildProviderOptions(

// Build Anthropic-specific options
if (provider === "anthropic") {
const budgetTokens = ANTHROPIC_THINKING_BUDGETS[effectiveThinking];
const effort = ANTHROPIC_EFFORT[effectiveThinking];
log.debug("buildProviderOptions: Anthropic config", {
budgetTokens,
effort,
thinkingLevel: effectiveThinking,
});

const options: ProviderOptions = {
anthropic: {
disableParallelToolUse: false, // Always enable concurrent tool execution
sendReasoning: true, // Include reasoning traces in requests sent to the model
// Conditionally add thinking configuration
...(budgetTokens > 0 && {
thinking: {
type: "enabled",
budgetTokens,
},
}),
// Use effort parameter to control token spend (thinking, text, and tool calls)
// SDK auto-adds beta header "effort-2025-11-24" when effort is set
...(effort && { effort }),
},
};
log.debug("buildProviderOptions: Returning Anthropic options", options);
Expand Down Expand Up @@ -278,77 +273,3 @@ export function buildProviderOptions(
log.debug("buildProviderOptions: Unsupported provider", provider);
return {};
}

/**
* Calculate the effective maxOutputTokens for a model based on its limits and thinking budget
*
* For Anthropic models with extended thinking, the AI SDK adds thinkingBudget to maxOutputTokens
* internally. We need to ensure the sum doesn't exceed the model's max_output_tokens limit.
*
* For example, Claude Opus 4 has max_output_tokens=32000. If we use:
* - thinkingBudget=20000 (high)
* - maxOutputTokens=32000
* Then total=52000 which exceeds 32000 → SDK shows warning and caps output
*
* Solution: Reduce maxOutputTokens so that maxOutputTokens + thinkingBudget <= model limit
*
* @param modelString - Full model string (e.g., "anthropic:claude-opus-4-1")
* @param thinkingLevel - Current thinking level
* @param requestedMaxOutputTokens - Optional user-requested maxOutputTokens
* @returns Effective maxOutputTokens that respects model limits with thinking budget
*/
export function calculateEffectiveMaxOutputTokens(
modelString: string,
thinkingLevel: ThinkingLevel,
requestedMaxOutputTokens?: number
): number | undefined {
const [provider] = modelString.split(":");

// Only apply this adjustment for Anthropic models
if (provider !== "anthropic") {
return requestedMaxOutputTokens;
}

// Get the actual thinking level after policy enforcement
const effectiveThinking = enforceThinkingPolicy(modelString, thinkingLevel);
const thinkingBudget = ANTHROPIC_THINKING_BUDGETS[effectiveThinking];

// Get model's max output tokens from models.json
const modelStats = getModelStats(modelString);
const modelMaxOutput = modelStats?.max_output_tokens;

// If we don't know the model's max output, return requested value
if (!modelMaxOutput) {
log.debug("calculateEffectiveMaxOutputTokens: Unknown model max output, using requested", {
modelString,
requestedMaxOutputTokens,
});
return requestedMaxOutputTokens;
}

// Calculate the maximum safe maxOutputTokens
// The SDK will add thinkingBudget to maxOutputTokens, so we need room for both
const maxSafeOutput = modelMaxOutput - thinkingBudget;

// If user didn't request specific tokens, use the max safe value
const targetOutput = requestedMaxOutputTokens ?? modelMaxOutput;

// Cap at the safe maximum
const effectiveOutput = Math.min(targetOutput, maxSafeOutput);

// Ensure we don't go below a reasonable minimum (1000 tokens)
const finalOutput = Math.max(effectiveOutput, 1000);

log.debug("calculateEffectiveMaxOutputTokens", {
modelString,
thinkingLevel,
effectiveThinking,
thinkingBudget,
modelMaxOutput,
requestedMaxOutputTokens,
maxSafeOutput,
finalOutput,
});

return finalOutput;
}
16 changes: 2 additions & 14 deletions src/node/services/aiService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,7 @@ import type { HistoryService } from "./historyService";
import type { PartialService } from "./partialService";
import { buildSystemMessage, readToolInstructions } from "./systemMessage";
import { getTokenizerForModel } from "@/node/utils/main/tokenizer";
import {
buildProviderOptions,
calculateEffectiveMaxOutputTokens,
} from "@/common/utils/ai/providerOptions";
import { buildProviderOptions } from "@/common/utils/ai/providerOptions";
import type { ThinkingLevel } from "@/common/types/thinking";
import type {
StreamAbortEvent,
Expand Down Expand Up @@ -929,15 +926,6 @@ export class AIService extends EventEmitter {
effectiveMuxProviderOptions
);

// Calculate effective maxOutputTokens that accounts for thinking budget
// For Anthropic models with extended thinking, the SDK adds thinkingBudget to maxOutputTokens
// so we need to ensure the sum doesn't exceed the model's max_output_tokens limit
const effectiveMaxOutputTokens = calculateEffectiveMaxOutputTokens(
effectiveModelString,
thinkingLevel ?? "off",
maxOutputTokens
);

// Delegate to StreamManager with model instance, system message, tools, historySequence, and initial metadata
const streamResult = await this.streamManager.startStream(
workspaceId,
Expand All @@ -955,7 +943,7 @@ export class AIService extends EventEmitter {
mode, // Pass mode so it persists in final history entry
},
providerOptions,
effectiveMaxOutputTokens,
maxOutputTokens,
toolPolicy,
streamToken // Pass the pre-generated stream token
);
Expand Down