🤖 fix: exclude output/reasoning tokens from context window percentage

ammar-agent · ammar-agent · commit 73573c10bf2c · 2025-12-03T14:23:20.000-06:00
Context window percentage was incorrectly including output and reasoning
tokens, which don't count against the model's input context limit. This
caused inflated percentages when models returned large outputs or used
extended thinking.

For example, with 150k input + 150k reasoning tokens on a 200k model:
- Before: (150k + 150k) / 200k = 150% (incorrect)
- After: 150k / 200k = 75% (correct)

Also fixes the threshold slider blocking the token meter tooltip: the slider
now captures mouse events only in a small zone around the indicator, so the
tooltip can show when hovering elsewhere on the bar.

Changes:
- tokenMeterUtils.ts: Calculate contextUsed separately from totalUsed
- CostsTab.tsx: Use contextUsed for the total percentage; keep totalUsed for proportional display
- autoCompactionCheck.ts: Replace getTotalTokens with getContextTokens, which
  sums only input, cached, and cacheCreate tokens
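- ThresholdSlider.tsx: Set pointer-events: none on the container and move the
  mouse handlers to a small drag handle around the indicator so the tooltip
  is not blocked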
- Updated tests to reflect correct behavior

---

_Generated with `mux`_
diff --git a/src/browser/components/RightSidebar/CostsTab.tsx b/src/browser/components/RightSidebar/CostsTab.tsx
@@ -126,13 +126,17 @@ const CostsTabComponent: React.FC<CostsTabProps> = ({ workspaceId }) => {
               const is1MActive = use1M && supports1MContext(model);
               const maxTokens = is1MActive ? 1_000_000 : baseMaxTokens;
 
-              // Total tokens includes cache creation (they're input tokens sent for caching)
-              const totalUsed = contextUsage
+              // Context window only includes input-side tokens (what's sent to the model)
+              // Output and reasoning tokens don't count against context window limits
+              const contextUsed = contextUsage
                 ? contextUsage.input.tokens +
                   contextUsage.cached.tokens +
-                  contextUsage.cacheCreate.tokens +
-                  contextUsage.output.tokens +
-                  contextUsage.reasoning.tokens
+                  contextUsage.cacheCreate.tokens
+                : 0;
+
+              // Total tokens across all categories (for proportional display)
+              const totalUsed = contextUsage
+                ? contextUsed + contextUsage.output.tokens + contextUsage.reasoning.tokens
                 : 0;
 
               // Calculate percentages based on max tokens (actual context window usage)
@@ -151,7 +155,8 @@ const CostsTabComponent: React.FC<CostsTabProps> = ({ workspaceId }) => {
                 cachedPercentage = (contextUsage.cached.tokens / maxTokens) * 100;
                 cacheCreatePercentage = (contextUsage.cacheCreate.tokens / maxTokens) * 100;
                 reasoningPercentage = (contextUsage.reasoning.tokens / maxTokens) * 100;
-                totalPercentage = (totalUsed / maxTokens) * 100;
+                // Use contextUsed for percentage (excludes output/reasoning from context limit)
+                totalPercentage = (contextUsed / maxTokens) * 100;
               } else if (contextUsage) {
                 // Unknown model - scale to total tokens used
                 inputPercentage = totalUsed > 0 ? (contextUsage.input.tokens / totalUsed) * 100 : 0;
diff --git a/src/browser/components/RightSidebar/ThresholdSlider.tsx b/src/browser/components/RightSidebar/ThresholdSlider.tsx
@@ -174,15 +174,38 @@ export const ThresholdSlider: React.FC<ThresholdSliderProps> = ({ config, orient
   const color = isEnabled ? "var(--color-plan-mode)" : "var(--color-muted)";
   const tooltipText = getTooltipText(config.threshold, orientation);
 
-  // Container styles
+  // Container styles - covers the full bar area for drag handling
+  // Uses pointer-events: none by default, only the indicator handle has pointer-events: auto
+  // This allows the token meter tooltip to work when hovering elsewhere on the bar
   const containerStyle: React.CSSProperties = {
     position: "absolute",
-    cursor: isHorizontal ? "ew-resize" : "ns-resize",
     top: 0,
     bottom: 0,
     left: 0,
     right: 0,
     zIndex: 50,
+    pointerEvents: "none", // Let events pass through to tooltip beneath
+  };
+
+  // Drag handle around the indicator - this captures mouse events
+  const DRAG_ZONE_SIZE = 16; // pixels on each side of the indicator
+  const handleStyle: React.CSSProperties = {
+    position: "absolute",
+    cursor: isHorizontal ? "ew-resize" : "ns-resize",
+    pointerEvents: "auto", // Only this element captures events
+    ...(isHorizontal
+      ? {
+          left: `calc(${config.threshold}% - ${DRAG_ZONE_SIZE}px)`,
+          width: DRAG_ZONE_SIZE * 2,
+          top: 0,
+          bottom: 0,
+        }
+      : {
+          top: `calc(${config.threshold}% - ${DRAG_ZONE_SIZE}px)`,
+          height: DRAG_ZONE_SIZE * 2,
+          left: 0,
+          right: 0,
+        }),
   };
 
   // Indicator positioning - use transform for centering on both axes
@@ -215,15 +238,17 @@ export const ThresholdSlider: React.FC<ThresholdSliderProps> = ({ config, orient
   const containerRect = containerRef.current?.getBoundingClientRect();
 
   return (
-    <div
-      ref={containerRef}
-      style={containerStyle}
-      onMouseDown={handleMouseDown}
-      onMouseEnter={() => setIsHovered(true)}
-      onMouseLeave={() => setIsHovered(false)}
-      // Horizontal uses native title (simpler, no clipping issues with wide tooltips)
-      title={isHorizontal ? tooltipText : undefined}
-    >
+    <div ref={containerRef} style={containerStyle}>
+      {/* Drag handle - captures mouse events in a small zone around the indicator */}
+      <div
+        style={handleStyle}
+        onMouseDown={handleMouseDown}
+        onMouseEnter={() => setIsHovered(true)}
+        onMouseLeave={() => setIsHovered(false)}
+        // Horizontal uses native title (simpler, no clipping issues with wide tooltips)
+        title={isHorizontal ? tooltipText : undefined}
+      />
+
       {/* Visual indicator - pointer events disabled */}
       <div style={indicatorStyle}>
         <Triangle direction={isHorizontal ? "down" : "right"} color={color} />
diff --git a/src/browser/utils/compaction/autoCompactionCheck.test.ts b/src/browser/utils/compaction/autoCompactionCheck.test.ts
@@ -5,14 +5,18 @@ import type { ChatUsageDisplay } from "@/common/utils/tokens/usageAggregator";
 import { KNOWN_MODELS } from "@/common/constants/knownModels";
 
 // Helper to create a mock usage entry
+// contextTokens is the total INPUT-side tokens (input + cached + cacheCreate)
+// This matches what counts against the context window limit
 const createUsageEntry = (
-  tokens: number,
+  contextTokens: number,
   model: string = KNOWN_MODELS.SONNET.id
 ): ChatUsageDisplay => {
-  // Distribute tokens across different types (realistic pattern)
-  const inputTokens = Math.floor(tokens * 0.6); // 60% input
-  const outputTokens = Math.floor(tokens * 0.3); // 30% output
-  const cachedTokens = Math.floor(tokens * 0.1); // 10% cached
+  // Distribute context tokens across input types (all count against context window)
+  const inputTokens = Math.floor(contextTokens * 0.85); // 85% fresh input
+  const cachedTokens = Math.floor(contextTokens * 0.15); // 15% cached input
+
+  // Output and reasoning are separate (don't count against context)
+  const outputTokens = Math.floor(contextTokens * 0.2); // ~20% of context size
 
   return {
     input: { tokens: inputTokens },
@@ -135,14 +139,15 @@ describe("checkAutoCompaction", () => {
       expect(result.shouldShowWarning).toBe(false);
     });
 
-    test("includes all token types in calculation", () => {
+    test("only counts input-side tokens (input, cached, cacheCreate) for context window", () => {
       // Create usage with all token types specified
+      // Only input-side tokens should count against context window
       const usageEntry = {
         input: { tokens: 10_000 },
         cached: { tokens: 5_000 },
         cacheCreate: { tokens: 2_000 },
-        output: { tokens: 3_000 },
-        reasoning: { tokens: 1_000 },
+        output: { tokens: 3_000 }, // Should NOT count
+        reasoning: { tokens: 1_000 }, // Should NOT count
         model: KNOWN_MODELS.SONNET.id,
       };
       const usage: WorkspaceUsageState = {
@@ -153,8 +158,9 @@ describe("checkAutoCompaction", () => {
 
       const result = checkAutoCompaction(usage, KNOWN_MODELS.SONNET.id, false);
 
-      // Total: 10k + 5k + 2k + 3k + 1k = 21k tokens = 10.5%
-      expect(result.usagePercentage).toBe(10.5);
+      // Context tokens: 10k + 5k + 2k = 17k (output/reasoning excluded)
+      // 17,000 / 200,000 = 8.5%
+      expect(result.usagePercentage).toBe(8.5);
     });
   });
 
diff --git a/src/browser/utils/compaction/autoCompactionCheck.ts b/src/browser/utils/compaction/autoCompactionCheck.ts
@@ -24,15 +24,13 @@ import {
   FORCE_COMPACTION_BUFFER_PERCENT,
 } from "@/common/constants/ui";
 
-/** Sum all token components from a ChatUsageDisplay */
-function getTotalTokens(usage: ChatUsageDisplay): number {
-  return (
-    usage.input.tokens +
-    usage.cached.tokens +
-    usage.cacheCreate.tokens +
-    usage.output.tokens +
-    usage.reasoning.tokens
-  );
+/**
+ * Get context window token count from a ChatUsageDisplay.
+ * Only includes input-side tokens (what's sent to the model).
+ * Output and reasoning tokens don't count against context window limits.
+ */
+function getContextTokens(usage: ChatUsageDisplay): number {
+  return usage.input.tokens + usage.cached.tokens + usage.cacheCreate.tokens;
 }
 
 export interface AutoCompactionCheckResult {
@@ -100,15 +98,15 @@ export function checkAutoCompaction(
   const currentUsage = usage.liveUsage ?? lastUsage;
 
   // Usage percentage from current context (live when streaming, otherwise last completed)
-  const usagePercentage = currentUsage ? (getTotalTokens(currentUsage) / maxTokens) * 100 : 0;
+  const usagePercentage = currentUsage ? (getContextTokens(currentUsage) / maxTokens) * 100 : 0;
 
   // Force-compact when usage exceeds threshold + buffer
   const forceCompactThreshold = thresholdPercentage + FORCE_COMPACTION_BUFFER_PERCENT;
   const shouldForceCompact = usagePercentage >= forceCompactThreshold;
 
   // Warning uses max of last completed and current (live when streaming)
   // This ensures warning shows when live usage spikes above threshold mid-stream
-  const lastUsagePercentage = lastUsage ? (getTotalTokens(lastUsage) / maxTokens) * 100 : 0;
+  const lastUsagePercentage = lastUsage ? (getContextTokens(lastUsage) / maxTokens) * 100 : 0;
   const shouldShowWarning =
     Math.max(lastUsagePercentage, usagePercentage) >= thresholdPercentage - warningAdvancePercent;
 
diff --git a/src/common/utils/tokens/tokenMeterUtils.ts b/src/common/utils/tokens/tokenMeterUtils.ts
@@ -63,12 +63,12 @@ export function calculateTokenMeterData(
   const modelStats = getModelStats(model);
   const maxTokens = use1M && supports1MContext(model) ? 1_000_000 : modelStats?.max_input_tokens;
 
-  const totalUsed =
-    usage.input.tokens +
-    usage.cached.tokens +
-    usage.cacheCreate.tokens +
-    usage.output.tokens +
-    usage.reasoning.tokens;
+  // Context window only includes input-side tokens (what's sent to the model)
+  // Output and reasoning tokens don't count against context window limits
+  const contextUsed = usage.input.tokens + usage.cached.tokens + usage.cacheCreate.tokens;
+
+  // Total tokens across all categories (for proportional segment sizing)
+  const totalUsed = contextUsed + usage.output.tokens + usage.reasoning.tokens;
 
   const toPercentage = (tokens: number) => {
     if (verticalProportions) {
@@ -84,7 +84,8 @@ export function calculateTokenMeterData(
     color: def.color,
   }));
 
-  const contextPercentage = maxTokens ? (totalUsed / maxTokens) * 100 : 100;
+  // Context percentage based only on input-side tokens
+  const contextPercentage = maxTokens ? (contextUsed / maxTokens) * 100 : 100;
 
   return {
     segments,