From 9c89d47b037544fe8dca97f0d260d54628e5a646 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Mon, 1 Dec 2025 22:41:47 -0600
Subject: [PATCH 01/18] =?UTF-8?q?=F0=9F=A4=96=20feat:=20add=20voice=20inpu?=
 =?UTF-8?q?t=20mode=20using=20OpenAI=20Whisper?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add Ctrl+D / Cmd+D keybind to toggle voice recording
- Add mic button floating inside the chat input textarea
  (hidden on mobile, when MediaRecorder is unsupported, or with no OpenAI key)
- Add command palette command for voice input toggle
- Record audio via MediaRecorder, transcribe via Whisper API
- Show recording indicator while capturing, spinner while transcribing
- Append dictated text to existing input
- Handle errors with user-friendly toast messages

Requires an OpenAI API key to be configured in Settings > Providers.

_Generated with mux_
---
 bun.lock                                      |   3 +
 package.json                                  |   1 +
 src/browser/api.ts                            |   3 +
 .../components/ChatInput/VoiceInputButton.tsx |  66 ++++++
 src/browser/components/ChatInput/index.tsx    | 178 ++++++++++++++--
 src/browser/hooks/useVoiceInput.ts            | 193 ++++++++++++++++++
 src/browser/stories/mockFactory.ts            |   3 +
 src/browser/utils/commandIds.ts               |   1 +
 src/browser/utils/commands/sources.ts         |  11 +
 src/browser/utils/ui/keybinds.ts              |   5 +
 src/common/constants/events.ts                |   7 +
 src/common/constants/ipc-constants.ts         |   3 +
 src/common/types/ipc.ts                       |   4 +
 src/desktop/preload.ts                        |   4 +
 src/node/services/ipcMain.ts                  |  40 ++++
 15 files changed, 501 insertions(+), 21 deletions(-)
 create mode 100644 src/browser/components/ChatInput/VoiceInputButton.tsx
 create mode 100644 src/browser/hooks/useVoiceInput.ts

diff --git a/bun.lock b/bun.lock
index 899b1121ae..32ee53eb40 100644
--- a/bun.lock
+++ b/bun.lock
@@ -42,6 +42,7 @@
         "minimist": "^1.2.8",
         "motion": "^12.23.24",
         "ollama-ai-provider-v2": "^1.5.4",
+        "openai": "^6.9.1",
         "rehype-harden": "^1.1.5",
         "shescape": "^2.1.6",
         "source-map-support": "^0.5.21",
@@ -2688,6 +2689,8 @@
 
     "oniguruma-to-es": ["oniguruma-to-es@4.3.3", "", { "dependencies": { "oniguruma-parser": "^0.12.1", "regex": "^6.0.1", "regex-recursion": "^6.0.2" } }, "sha512-rPiZhzC3wXwE59YQMRDodUwwT9FZ9nNBwQQfsd1wfdtlKEyCdRV0avrTcSZ5xlIvGRVPd/cx6ZN45ECmS39xvg=="],
 
+    "openai": ["openai@6.9.1", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-vQ5Rlt0ZgB3/BNmTa7bIijYFhz3YBceAA3Z4JuoMSBftBF9YqFHIEhZakSs+O/Ad7EaoEimZvHxD5ylRjN11Lg=="],
+
     "optionator": ["optionator@0.9.4", "", { "dependencies": { "deep-is": "^0.1.3", "fast-levenshtein": "^2.0.6", "levn": "^0.4.1", "prelude-ls": "^1.2.1", "type-check": "^0.4.0", "word-wrap": "^1.2.5" } }, "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g=="],
 
     "ora": ["ora@5.4.1", "", { "dependencies": { "bl": "^4.1.0", "chalk": "^4.1.0", "cli-cursor": "^3.1.0", "cli-spinners": "^2.5.0", "is-interactive": "^1.0.0", "is-unicode-supported": "^0.1.0", "log-symbols": "^4.1.0", "strip-ansi": "^6.0.0", "wcwidth": "^1.0.1" } }, "sha512-5b6Y85tPxZZ7QytO+BQzysW31HJku27cRIlkbAXaNx+BdcVi+LlRFmVXzeF6a7JCwJpyw5c4b+YSVImQIrBpuQ=="],
diff --git a/package.json b/package.json
index 80ea0a48c4..28b2be79e3 100644
--- a/package.json
+++ b/package.json
@@ -83,6 +83,7 @@
     "minimist": "^1.2.8",
     "motion": "^12.23.24",
     "ollama-ai-provider-v2": "^1.5.4",
+    "openai": "^6.9.1",
     "rehype-harden": "^1.1.5",
     "shescape": "^2.1.6",
     "source-map-support": "^0.5.21",
diff --git a/src/browser/api.ts b/src/browser/api.ts
index 33b9ad37ab..a98675cb4a 100644
--- a/src/browser/api.ts
+++ b/src/browser/api.ts
@@ -361,6 +361,9 @@ const webApi: IPCApi = {
     },
     closeWindow: (workspaceId) => invokeIPC(IPC_CHANNELS.TERMINAL_WINDOW_CLOSE, workspaceId),
   },
+  voice: {
+    transcribe: (audioBase64) => invokeIPC(IPC_CHANNELS.VOICE_TRANSCRIBE, audioBase64),
+  },
   update: {
     check: () => invokeIPC(IPC_CHANNELS.UPDATE_CHECK),
     download: () => invokeIPC(IPC_CHANNELS.UPDATE_DOWNLOAD),
diff --git a/src/browser/components/ChatInput/VoiceInputButton.tsx b/src/browser/components/ChatInput/VoiceInputButton.tsx
new file mode 100644
index 0000000000..9673b36f68
--- /dev/null
+++ b/src/browser/components/ChatInput/VoiceInputButton.tsx
@@ -0,0 +1,66 @@
+/**
+ * Voice input button - floats inside the chat input textarea.
+ * Minimal footprint: just an icon that changes color based on state.
+ *
+ * Visual states:
+ * - Idle: Subtle gray mic icon
+ * - Recording: Red pulsing mic
+ * - Transcribing: Orange spinning loader
+ * - Hidden: When on mobile, unsupported, or no OpenAI key
+ */
+
+import React from "react";
+import { Mic, Loader2 } from "lucide-react";
+import { TooltipWrapper, Tooltip } from "../Tooltip";
+import { formatKeybind, KEYBINDS } from "@/browser/utils/ui/keybinds";
+import { cn } from "@/common/lib/utils";
+
+interface VoiceInputButtonProps {
+  isListening: boolean;
+  isTranscribing: boolean;
+  isSupported: boolean;
+  shouldShowUI: boolean;
+  onToggle: () => void;
+  disabled?: boolean;
+}
+
+export const VoiceInputButton: React.FC<VoiceInputButtonProps> = (props) => {
+  // Don't render if we shouldn't show UI (mobile, unsupported, or no OpenAI key)
+  if (!props.shouldShowUI) {
+    return null;
+  }
+
+  const label = props.isTranscribing
+    ? "Transcribing..."
+    : props.isListening
+      ? "Stop recording"
+      : "Voice input";
+
+  const Icon = props.isTranscribing ? Loader2 : Mic;
+
+  return (
+    <TooltipWrapper inline>
+      <button
+        type="button"
+        onClick={props.onToggle}
+        disabled={(props.disabled ?? false) || !props.isSupported || props.isTranscribing}
+        aria-label={label}
+        aria-pressed={props.isListening}
+        className={cn(
+          "inline-flex items-center justify-center rounded p-0.5 transition-colors duration-150",
+          "disabled:cursor-not-allowed disabled:opacity-40",
+          props.isTranscribing
+            ? "text-orange-500"
+            : props.isListening
+              ? "text-red-500 animate-pulse"
+              : "text-muted/50 hover:text-muted"
+        )}
+      >
+        <Icon className={cn("h-4 w-4", props.isTranscribing && "animate-spin")} strokeWidth={1.5} />
+      </button>
+      <Tooltip className="tooltip" align="right">
+        {label} ({formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)})
+      </Tooltip>
+    </TooltipWrapper>
+  );
+};
diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index 0a686843be..d7da34984c 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -65,6 +65,8 @@ import { cn } from "@/common/lib/utils";
 import { CreationControls } from "./CreationControls";
 import { useCreationWorkspace } from "./useCreationWorkspace";
 import { useTutorial } from "@/browser/contexts/TutorialContext";
+import { useVoiceInput } from "@/browser/hooks/useVoiceInput";
+import { VoiceInputButton } from "./VoiceInputButton";
 
 const LEADING_COMMAND_NOISE = /^(?:\s|\u200B|\u200C|\u200D|\u200E|\u200F|\uFEFF)+/;
 
@@ -154,6 +156,45 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
   });
   const { startSequence: startTutorial } = useTutorial();
 
+  // Track if OpenAI API key is configured for voice input
+  const [openAIKeySet, setOpenAIKeySet] = useState(false);
+
+  // Voice input handling - appends transcribed text to input
+  const handleVoiceTranscript = useCallback(
+    (text: string, _isFinal: boolean) => {
+      // Whisper only returns final results, append to input with space separator if needed
+      setInput((prev) => {
+        const separator = prev.length > 0 && !prev.endsWith(" ") ? " " : "";
+        return prev + separator + text;
+      });
+    },
+    [setInput]
+  );
+
+  const handleVoiceError = useCallback(
+    (error: string) => {
+      // Map common errors to user-friendly messages
+      const errorMessages: Record<string, string> = {
+        "not-allowed": "Microphone access denied. Please allow microphone access and try again.",
+        "no-speech": "No speech detected. Please try again.",
+        network: "Network error. Please check your connection.",
+        "audio-capture": "No microphone found. Please connect a microphone.",
+      };
+      setToast({
+        id: Date.now().toString(),
+        type: "error",
+        message: errorMessages[error] ?? `Voice input error: ${error}`,
+      });
+    },
+    [setToast]
+  );
+
+  const voiceInput = useVoiceInput({
+    onTranscript: handleVoiceTranscript,
+    onError: handleVoiceError,
+    openAIKeySet,
+  });
+
   // Start creation tutorial when entering creation mode
   useEffect(() => {
     if (variant === "creation") {
@@ -370,6 +411,28 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
     };
   }, []);
 
+  // Check if OpenAI API key is configured (for voice input)
+  useEffect(() => {
+    let isMounted = true;
+
+    const checkOpenAIKey = async () => {
+      try {
+        const config = await window.api.providers.getConfig();
+        if (isMounted) {
+          setOpenAIKeySet(config.openai?.apiKeySet ?? false);
+        }
+      } catch (error) {
+        console.error("Failed to check OpenAI API key:", error);
+      }
+    };
+
+    void checkOpenAIKey();
+
+    return () => {
+      isMounted = false;
+    };
+  }, []);
+
   // Allow external components (e.g., CommandPalette, Queued message edits) to insert text
   useEffect(() => {
     const handler = (e: Event) => {
@@ -437,6 +500,18 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
       window.removeEventListener(CUSTOM_EVENTS.THINKING_LEVEL_TOAST, handler as EventListener);
   }, [variant, props, setToast]);
 
+  // Listen for voice input toggle from command palette
+  useEffect(() => {
+    if (!voiceInput.shouldShowUI) return;
+
+    const handler = () => {
+      voiceInput.toggleListening();
+    };
+    window.addEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener);
+    return () =>
+      window.removeEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener);
+  }, [voiceInput]);
+
   // Auto-focus chat input when workspace changes (workspace only)
   const workspaceIdForFocus = variant === "workspace" ? props.workspaceId : null;
   useEffect(() => {
@@ -768,6 +843,13 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
       return;
     }
 
+    // Handle voice input toggle (Ctrl+D / Cmd+D)
+    if (matchesKeybind(e, KEYBINDS.TOGGLE_VOICE_INPUT) && voiceInput.shouldShowUI) {
+      e.preventDefault();
+      voiceInput.toggleListening();
+      return;
+    }
+
     // Handle open model selector
     if (matchesKeybind(e, KEYBINDS.OPEN_MODEL_SELECTOR)) {
       e.preventDefault();
@@ -896,27 +978,81 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
             anchorRef={variant === "creation" ? inputRef : undefined}
           />
 
-          <div className="flex items-end" data-component="ChatInputControls">
-            <VimTextArea
-              ref={inputRef}
-              value={input}
-              isEditing={!!editingMessage}
-              mode={mode}
-              onChange={setInput}
-              onKeyDown={handleKeyDown}
-              onPaste={handlePaste}
-              onDragOver={handleDragOver}
-              onDrop={handleDrop}
-              suppressKeys={showCommandSuggestions ? COMMAND_SUGGESTION_KEYS : undefined}
-              placeholder={placeholder}
-              disabled={!editingMessage && (disabled || isSending)}
-              aria-label={editingMessage ? "Edit your last message" : "Message Claude"}
-              aria-autocomplete="list"
-              aria-controls={
-                showCommandSuggestions && commandSuggestions.length > 0 ? commandListId : undefined
-              }
-              aria-expanded={showCommandSuggestions && commandSuggestions.length > 0}
-            />
+          <div className="relative flex items-end" data-component="ChatInputControls">
+            {/* Recording overlay - dramatically replaces textarea when recording */}
+            {voiceInput.isListening ? (
+              <button
+                type="button"
+                onClick={voiceInput.toggleListening}
+                className="flex min-h-[60px] w-full cursor-pointer items-center justify-center gap-3 rounded-md border-2 border-red-500 bg-red-500/10 px-4 py-4 transition-all"
+                aria-label="Stop recording"
+              >
+                {/* Animated waveform bars */}
+                <div className="flex items-center gap-1">
+                  {[0, 1, 2, 3, 4].map((i) => (
+                    <div
+                      key={i}
+                      className="w-1 rounded-full bg-red-500"
+                      style={{
+                        height: `${12 + Math.sin(i * 0.8) * 8}px`,
+                        animation: `pulse 0.8s ease-in-out ${i * 0.1}s infinite alternate`,
+                      }}
+                    />
+                  ))}
+                </div>
+                <span className="text-sm font-medium text-red-500">
+                  Recording... tap to stop ({formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)})
+                </span>
+                <div className="flex items-center gap-1">
+                  {[0, 1, 2, 3, 4].map((i) => (
+                    <div
+                      key={i}
+                      className="w-1 rounded-full bg-red-500"
+                      style={{
+                        height: `${12 + Math.sin((4 - i) * 0.8) * 8}px`,
+                        animation: `pulse 0.8s ease-in-out ${(4 - i) * 0.1}s infinite alternate`,
+                      }}
+                    />
+                  ))}
+                </div>
+              </button>
+            ) : (
+              <>
+                <VimTextArea
+                  ref={inputRef}
+                  value={input}
+                  isEditing={!!editingMessage}
+                  mode={mode}
+                  onChange={setInput}
+                  onKeyDown={handleKeyDown}
+                  onPaste={handlePaste}
+                  onDragOver={handleDragOver}
+                  onDrop={handleDrop}
+                  suppressKeys={showCommandSuggestions ? COMMAND_SUGGESTION_KEYS : undefined}
+                  placeholder={placeholder}
+                  disabled={!editingMessage && (disabled || isSending)}
+                  aria-label={editingMessage ? "Edit your last message" : "Message Claude"}
+                  aria-autocomplete="list"
+                  aria-controls={
+                    showCommandSuggestions && commandSuggestions.length > 0
+                      ? commandListId
+                      : undefined
+                  }
+                  aria-expanded={showCommandSuggestions && commandSuggestions.length > 0}
+                />
+                {/* Floating voice input button inside textarea */}
+                <div className="absolute bottom-2 right-2">
+                  <VoiceInputButton
+                    isListening={voiceInput.isListening}
+                    isTranscribing={voiceInput.isTranscribing}
+                    isSupported={voiceInput.isSupported}
+                    shouldShowUI={voiceInput.shouldShowUI}
+                    onToggle={voiceInput.toggleListening}
+                    disabled={disabled || isSending}
+                  />
+                </div>
+              </>
+            )}
           </div>
 
           {/* Image attachments */}
diff --git a/src/browser/hooks/useVoiceInput.ts b/src/browser/hooks/useVoiceInput.ts
new file mode 100644
index 0000000000..e92b6e884e
--- /dev/null
+++ b/src/browser/hooks/useVoiceInput.ts
@@ -0,0 +1,193 @@
+/**
+ * Hook for voice input using OpenAI Whisper API via MediaRecorder.
+ *
+ * Features:
+ * - Records audio using MediaRecorder (webm/opus format)
+ * - Sends to backend which calls OpenAI Whisper for transcription
+ * - Shows recording state while capturing
+ * - Shows transcribing state while processing
+ * - Hidden on mobile (native keyboards have built-in dictation)
+ * - Disabled when OpenAI API key not configured
+ */
+
+import { useState, useCallback, useRef, useEffect } from "react";
+
+// Check if we're on a mobile device (touch-based)
+function isMobileDevice(): boolean {
+  if (typeof window === "undefined") return false;
+  // Check for touch capability and small screen as heuristics
+  return ("ontouchstart" in window || navigator.maxTouchPoints > 0) && window.innerWidth < 768;
+}
+
+// Check if MediaRecorder is available
+function isMediaRecorderSupported(): boolean {
+  return typeof window !== "undefined" && typeof MediaRecorder !== "undefined";
+}
+
+export interface UseVoiceInputOptions {
+  /** Called when transcript text is received */
+  onTranscript: (text: string, isFinal: boolean) => void;
+  /** Called when an error occurs */
+  onError?: (error: string) => void;
+  /** Whether OpenAI API key is configured */
+  openAIKeySet: boolean;
+}
+
+export interface UseVoiceInputResult {
+  /** Whether voice input is currently recording */
+  isListening: boolean;
+  /** Whether transcription is in progress */
+  isTranscribing: boolean;
+  /** Whether the browser supports MediaRecorder */
+  isSupported: boolean;
+  /** Whether we should show voice UI (supported, not mobile, API key set) */
+  shouldShowUI: boolean;
+  /** Start recording for voice input */
+  startListening: () => void;
+  /** Stop recording and transcribe */
+  stopListening: () => void;
+  /** Toggle recording state */
+  toggleListening: () => void;
+}
+
+export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResult {
+  const { onTranscript, onError, openAIKeySet } = options;
+
+  const [isListening, setIsListening] = useState(false);
+  const [isTranscribing, setIsTranscribing] = useState(false);
+
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const audioChunksRef = useRef<Blob[]>([]);
+  const streamRef = useRef<MediaStream | null>(null);
+
+  const isSupported = isMediaRecorderSupported();
+  const isMobile = isMobileDevice();
+
+  // Store callbacks in refs to avoid recreating on every render
+  const onTranscriptRef = useRef(onTranscript);
+  const onErrorRef = useRef(onError);
+  useEffect(() => {
+    onTranscriptRef.current = onTranscript;
+    onErrorRef.current = onError;
+  }, [onTranscript, onError]);
+
+  const transcribeAudio = useCallback(async (audioBlob: Blob) => {
+    setIsTranscribing(true);
+    try {
+      // Convert blob to base64
+      const arrayBuffer = await audioBlob.arrayBuffer();
+      const base64 = btoa(
+        new Uint8Array(arrayBuffer).reduce((data, byte) => data + String.fromCharCode(byte), "")
+      );
+
+      // Call backend to transcribe
+      const result = await window.api.voice.transcribe(base64);
+
+      if (result.success) {
+        if (result.data.trim()) {
+          onTranscriptRef.current(result.data, true);
+        }
+      } else {
+        onErrorRef.current?.(result.error);
+      }
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err);
+      onErrorRef.current?.(`Transcription failed: ${message}`);
+    } finally {
+      setIsTranscribing(false);
+    }
+  }, []);
+
+  const startListening = useCallback(async () => {
+    if (!isSupported || isListening || isTranscribing || !openAIKeySet) return;
+
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      streamRef.current = stream;
+
+      // Use webm/opus which is well supported and works with Whisper
+      const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus")
+        ? "audio/webm;codecs=opus"
+        : "audio/webm";
+
+      const mediaRecorder = new MediaRecorder(stream, { mimeType });
+      audioChunksRef.current = [];
+
+      mediaRecorder.ondataavailable = (event) => {
+        if (event.data.size > 0) {
+          audioChunksRef.current.push(event.data);
+        }
+      };
+
+      mediaRecorder.onstop = () => {
+        const audioBlob = new Blob(audioChunksRef.current, { type: mimeType });
+        audioChunksRef.current = [];
+
+        // Stop all tracks to release microphone
+        stream.getTracks().forEach((track) => track.stop());
+        streamRef.current = null;
+
+        // Transcribe the audio
+        void transcribeAudio(audioBlob);
+      };
+
+      mediaRecorder.onerror = () => {
+        onErrorRef.current?.("Recording failed");
+        setIsListening(false);
+        stream.getTracks().forEach((track) => track.stop());
+        streamRef.current = null;
+      };
+
+      mediaRecorderRef.current = mediaRecorder;
+      mediaRecorder.start();
+      setIsListening(true);
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err);
+      if (message.includes("Permission denied") || message.includes("NotAllowedError")) {
+        onErrorRef.current?.(
+          "Microphone access denied. Please allow microphone access and try again."
+        );
+      } else {
+        onErrorRef.current?.(`Failed to start recording: ${message}`);
+      }
+    }
+  }, [isSupported, isListening, isTranscribing, openAIKeySet, transcribeAudio]);
+
+  const stopListening = useCallback(() => {
+    if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
+      mediaRecorderRef.current.stop();
+      mediaRecorderRef.current = null;
+    }
+    setIsListening(false);
+  }, []);
+
+  const toggleListening = useCallback(() => {
+    if (isListening) {
+      stopListening();
+    } else {
+      void startListening();
+    }
+  }, [isListening, startListening, stopListening]);
+
+  // Cleanup on unmount
+  useEffect(() => {
+    return () => {
+      if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
+        mediaRecorderRef.current.stop();
+      }
+      if (streamRef.current) {
+        streamRef.current.getTracks().forEach((track) => track.stop());
+      }
+    };
+  }, []);
+
+  return {
+    isListening,
+    isTranscribing,
+    isSupported,
+    shouldShowUI: isSupported && !isMobile && openAIKeySet,
+    startListening: () => void startListening(),
+    stopListening,
+    toggleListening,
+  };
+}
diff --git a/src/browser/stories/mockFactory.ts b/src/browser/stories/mockFactory.ts
index 29916ebca6..3de5878047 100644
--- a/src/browser/stories/mockFactory.ts
+++ b/src/browser/stories/mockFactory.ts
@@ -456,6 +456,9 @@ export function createMockAPI(options: MockAPIOptions): IPCApi {
       openWindow: () => Promise.resolve(undefined),
       closeWindow: () => Promise.resolve(undefined),
     },
+    voice: {
+      transcribe: () => Promise.resolve({ success: false, error: "Not implemented in mock" }),
+    },
     update: {
       check: () => Promise.resolve(undefined),
       download: () => Promise.resolve(undefined),
diff --git a/src/browser/utils/commandIds.ts b/src/browser/utils/commandIds.ts
index 8976082fc4..cc92aa0057 100644
--- a/src/browser/utils/commandIds.ts
+++ b/src/browser/utils/commandIds.ts
@@ -39,6 +39,7 @@ export const CommandIds = {
   chatTruncate: (pct: number) => `${COMMAND_ID_PREFIXES.CHAT_TRUNCATE}${pct}` as const,
   chatInterrupt: () => "chat:interrupt" as const,
   chatJumpBottom: () => "chat:jumpBottom" as const,
+  chatVoiceInput: () => "chat:voiceInput" as const,
 
   // Mode commands
   modeToggle: () => "mode:toggle" as const,
diff --git a/src/browser/utils/commands/sources.ts b/src/browser/utils/commands/sources.ts
index 09029e5f44..8738f6b5ee 100644
--- a/src/browser/utils/commands/sources.ts
+++ b/src/browser/utils/commands/sources.ts
@@ -388,6 +388,17 @@ export function buildCoreSources(p: BuildSourcesParams): Array<() => CommandActi
           window.dispatchEvent(ev);
         },
       });
+      list.push({
+        id: CommandIds.chatVoiceInput(),
+        title: "Toggle Voice Input",
+        subtitle: "Dictate instead of typing",
+        section: section.chat,
+        shortcutHint: formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT),
+        run: () => {
+          // Dispatch custom event; ChatInput listens for it
+          window.dispatchEvent(createCustomEvent(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT));
+        },
+      });
     }
     return list;
   });
diff --git a/src/browser/utils/ui/keybinds.ts b/src/browser/utils/ui/keybinds.ts
index 0a85f645b4..0bc979c542 100644
--- a/src/browser/utils/ui/keybinds.ts
+++ b/src/browser/utils/ui/keybinds.ts
@@ -285,4 +285,9 @@ export const KEYBINDS = {
   /** Open settings modal */
   // macOS: Cmd+, Win/Linux: Ctrl+,
   OPEN_SETTINGS: { key: ",", ctrl: true },
+
+  /** Toggle voice input (dictation) */
+  // macOS: Cmd+D, Win/Linux: Ctrl+D
+  // "D" for Dictate - intuitive and available
+  TOGGLE_VOICE_INPUT: { key: "d", ctrl: true },
 } as const;
diff --git a/src/common/constants/events.ts b/src/common/constants/events.ts
index ccbd592113..807bc226fa 100644
--- a/src/common/constants/events.ts
+++ b/src/common/constants/events.ts
@@ -56,6 +56,12 @@ export const CUSTOM_EVENTS = {
    * Detail: { projectPath: string, startMessage?: string, model?: string, trunkBranch?: string, runtime?: string }
    */
   START_WORKSPACE_CREATION: "mux:startWorkspaceCreation",
+
+  /**
+   * Event to toggle voice input (dictation) mode
+   * No detail
+   */
+  TOGGLE_VOICE_INPUT: "mux:toggleVoiceInput",
 } as const;
 
 /**
@@ -94,6 +100,7 @@ export interface CustomEventPayloads {
     trunkBranch?: string;
     runtime?: string;
   };
+  [CUSTOM_EVENTS.TOGGLE_VOICE_INPUT]: never; // No payload
 }
 
 /**
diff --git a/src/common/constants/ipc-constants.ts b/src/common/constants/ipc-constants.ts
index be7bc45ccf..c335928a09 100644
--- a/src/common/constants/ipc-constants.ts
+++ b/src/common/constants/ipc-constants.ts
@@ -68,6 +68,9 @@ export const IPC_CHANNELS = {
   TOKENIZER_COUNT_TOKENS: "tokenizer:countTokens",
   TOKENIZER_COUNT_TOKENS_BATCH: "tokenizer:countTokensBatch",
 
+  // Voice channels
+  VOICE_TRANSCRIBE: "voice:transcribe",
+
   // Dynamic channel prefixes
   WORKSPACE_CHAT_PREFIX: "workspace:chat:",
   WORKSPACE_METADATA: "workspace:metadata",
diff --git a/src/common/types/ipc.ts b/src/common/types/ipc.ts
index 22f844e504..d07846257e 100644
--- a/src/common/types/ipc.ts
+++ b/src/common/types/ipc.ts
@@ -371,6 +371,10 @@ export interface IPCApi {
     openWindow(workspaceId: string): Promise<void>;
     closeWindow(workspaceId: string): Promise<void>;
   };
+  voice: {
+    /** Transcribe audio using OpenAI Whisper. Audio should be base64-encoded webm/opus. */
+    transcribe(audioBase64: string): Promise<Result<string, string>>;
+  };
   update: {
     check(): Promise<void>;
     download(): Promise<void>;
diff --git a/src/desktop/preload.ts b/src/desktop/preload.ts
index 8a9ea1c71c..8ac8a5f8d7 100644
--- a/src/desktop/preload.ts
+++ b/src/desktop/preload.ts
@@ -160,6 +160,10 @@ const api: IPCApi = {
   window: {
     setTitle: (title: string) => ipcRenderer.invoke(IPC_CHANNELS.WINDOW_SET_TITLE, title),
   },
+  voice: {
+    transcribe: (audioBase64: string) =>
+      ipcRenderer.invoke(IPC_CHANNELS.VOICE_TRANSCRIBE, audioBase64),
+  },
   update: {
     check: () => ipcRenderer.invoke(IPC_CHANNELS.UPDATE_CHECK),
     download: () => ipcRenderer.invoke(IPC_CHANNELS.UPDATE_DOWNLOAD),
diff --git a/src/node/services/ipcMain.ts b/src/node/services/ipcMain.ts
index 7de28a0d7e..9fa51fb784 100644
--- a/src/node/services/ipcMain.ts
+++ b/src/node/services/ipcMain.ts
@@ -44,6 +44,7 @@ import { PTYService } from "@/node/services/ptyService";
 import type { TerminalWindowManager } from "@/desktop/terminalWindowManager";
 import type { TerminalCreateParams, TerminalResizeParams } from "@/common/types/terminal";
 import { ExtensionMetadataService } from "@/node/services/ExtensionMetadataService";
+import OpenAI from "openai";
 
 /** Maximum number of retry attempts when workspace name collides */
 const MAX_WORKSPACE_NAME_COLLISION_RETRIES = 3;
@@ -633,6 +634,45 @@ export class IpcMain {
   }
 
   private registerWorkspaceHandlers(ipcMain: ElectronIpcMain): void {
+    // Voice transcription handler (uses OpenAI Whisper)
+    ipcMain.handle(
+      IPC_CHANNELS.VOICE_TRANSCRIBE,
+      async (_event, audioBase64: string): Promise<Result<string, string>> => {
+        try {
+          // Get OpenAI config
+          const providersConfig = this.config.loadProvidersConfig();
+          const openaiConfig = providersConfig?.openai;
+
+          if (!openaiConfig?.apiKey) {
+            return Err("OpenAI API key not configured. Set it in Settings > Providers.");
+          }
+
+          const client = new OpenAI({
+            apiKey: openaiConfig.apiKey,
+            baseURL: openaiConfig.baseUrl ?? openaiConfig.baseURL,
+          });
+
+          // Convert base64 to buffer
+          const audioBuffer = Buffer.from(audioBase64, "base64");
+
+          // Create a File object for the API
+          const audioFile = new File([audioBuffer], "audio.webm", { type: "audio/webm" });
+
+          // Call Whisper API
+          const transcription = await client.audio.transcriptions.create({
+            file: audioFile,
+            model: "whisper-1",
+          });
+
+          return Ok(transcription.text);
+        } catch (error) {
+          const message = error instanceof Error ? error.message : String(error);
+          log.error("[IpcMain] Voice transcription failed", error);
+          return Err(`Transcription failed: ${message}`);
+        }
+      }
+    );
+
     ipcMain.handle(
       IPC_CHANNELS.WORKSPACE_CREATE,
       async (

From e54dbf9b7a5f9b109b64e3fccaf943f4c15c47d8 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Mon, 1 Dec 2025 23:34:52 -0600
Subject: [PATCH 02/18] feat: improve voice recording UI states

- Show overlay during both recording AND transcribing states
  (prevents jarring snap-back to empty textarea when waiting for API)
- Change colors from red/orange (error-like) to blue (recording) and amber (transcribing)
- Disable overlay button while transcribing to prevent double-clicks
---
 .../components/ChatInput/VoiceInputButton.tsx |  8 ++--
 src/browser/components/ChatInput/index.tsx    | 39 ++++++++++++++-----
 2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/src/browser/components/ChatInput/VoiceInputButton.tsx b/src/browser/components/ChatInput/VoiceInputButton.tsx
index 9673b36f68..15548135bc 100644
--- a/src/browser/components/ChatInput/VoiceInputButton.tsx
+++ b/src/browser/components/ChatInput/VoiceInputButton.tsx
@@ -4,8 +4,8 @@
  *
  * Visual states:
  * - Idle: Subtle gray mic icon
- * - Recording: Red pulsing mic
- * - Transcribing: Orange spinning loader
+ * - Recording: Blue pulsing mic
+ * - Transcribing: Amber spinning loader
  * - Hidden: When on mobile, unsupported, or no OpenAI key
  */
 
@@ -50,9 +50,9 @@ export const VoiceInputButton: React.FC<VoiceInputButtonProps> = (props) => {
           "inline-flex items-center justify-center rounded p-0.5 transition-colors duration-150",
           "disabled:cursor-not-allowed disabled:opacity-40",
           props.isTranscribing
-            ? "text-orange-500"
+            ? "text-amber-500"
             : props.isListening
-              ? "text-red-500 animate-pulse"
+              ? "text-blue-500 animate-pulse"
               : "text-muted/50 hover:text-muted"
         )}
       >
diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index d7da34984c..3a14037c6a 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -979,20 +979,29 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
           />
 
           <div className="relative flex items-end" data-component="ChatInputControls">
-            {/* Recording overlay - dramatically replaces textarea when recording */}
-            {voiceInput.isListening ? (
+            {/* Recording/transcribing overlay - dramatically replaces textarea */}
+            {voiceInput.isListening || voiceInput.isTranscribing ? (
               <button
                 type="button"
-                onClick={voiceInput.toggleListening}
-                className="flex min-h-[60px] w-full cursor-pointer items-center justify-center gap-3 rounded-md border-2 border-red-500 bg-red-500/10 px-4 py-4 transition-all"
-                aria-label="Stop recording"
+                onClick={voiceInput.isListening ? voiceInput.toggleListening : undefined}
+                disabled={voiceInput.isTranscribing}
+                className={cn(
+                  "flex min-h-[60px] w-full items-center justify-center gap-3 rounded-md border-2 px-4 py-4 transition-all",
+                  voiceInput.isListening
+                    ? "cursor-pointer border-blue-500 bg-blue-500/10"
+                    : "cursor-wait border-amber-500 bg-amber-500/10"
+                )}
+                aria-label={voiceInput.isListening ? "Stop recording" : "Transcribing..."}
               >
                 {/* Animated waveform bars */}
                 <div className="flex items-center gap-1">
                   {[0, 1, 2, 3, 4].map((i) => (
                     <div
                       key={i}
-                      className="w-1 rounded-full bg-red-500"
+                      className={cn(
+                        "w-1 rounded-full",
+                        voiceInput.isListening ? "bg-blue-500" : "bg-amber-500"
+                      )}
                       style={{
                         height: `${12 + Math.sin(i * 0.8) * 8}px`,
                         animation: `pulse 0.8s ease-in-out ${i * 0.1}s infinite alternate`,
@@ -1000,14 +1009,24 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                     />
                   ))}
                 </div>
-                <span className="text-sm font-medium text-red-500">
-                  Recording... tap to stop ({formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)})
+                <span
+                  className={cn(
+                    "text-sm font-medium",
+                    voiceInput.isListening ? "text-blue-500" : "text-amber-500"
+                  )}
+                >
+                  {voiceInput.isListening
+                    ? `Recording... tap to stop (${formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)})`
+                    : "Transcribing..."}
                 </span>
                 <div className="flex items-center gap-1">
                   {[0, 1, 2, 3, 4].map((i) => (
                     <div
                       key={i}
-                      className="w-1 rounded-full bg-red-500"
+                      className={cn(
+                        "w-1 rounded-full",
+                        voiceInput.isListening ? "bg-blue-500" : "bg-amber-500"
+                      )}
                       style={{
                         height: `${12 + Math.sin((4 - i) * 0.8) * 8}px`,
                         animation: `pulse 0.8s ease-in-out ${(4 - i) * 0.1}s infinite alternate`,
@@ -1041,7 +1060,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                   aria-expanded={showCommandSuggestions && commandSuggestions.length > 0}
                 />
                 {/* Floating voice input button inside textarea */}
-                <div className="absolute bottom-2 right-2">
+                <div className="absolute right-2 bottom-2">
                   <VoiceInputButton
                     isListening={voiceInput.isListening}
                     isTranscribing={voiceInput.isTranscribing}

From 5bcfbdd9ccc0f5b7d7d864f076cace210cf515ed Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Mon, 1 Dec 2025 23:39:34 -0600
Subject: [PATCH 03/18] feat: space to send during recording, thinner border

- Space key during recording: stops recording and sends once transcription completes
- Ctrl+D/Cmd+D: stops recording, keeps text in input (existing)
- Reduced border from border-2 to border (less crowded near controls)
- Updated overlay text to show both shortcuts
---
 src/browser/components/ChatInput/index.tsx | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index 3a14037c6a..096447f962 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -984,9 +984,26 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
               <button
                 type="button"
                 onClick={voiceInput.isListening ? voiceInput.toggleListening : undefined}
+                onKeyDown={(e) => {
+                  // Space stops recording and sends immediately
+                  if (e.key === " " && voiceInput.isListening) {
+                    e.preventDefault();
+                    voiceInput.stopListening();
+                    // Small delay to let transcription complete, then send
+                    // The transcript callback will update input, then we send
+                    const checkAndSend = () => {
+                      if (!voiceInput.isTranscribing) {
+                        void handleSend();
+                      } else {
+                        setTimeout(checkAndSend, 100);
+                      }
+                    };
+                    setTimeout(checkAndSend, 100);
+                  }
+                }}
                 disabled={voiceInput.isTranscribing}
                 className={cn(
-                  "flex min-h-[60px] w-full items-center justify-center gap-3 rounded-md border-2 px-4 py-4 transition-all",
+                  "flex min-h-[60px] w-full items-center justify-center gap-3 rounded-md border px-4 py-4 transition-all",
                   voiceInput.isListening
                     ? "cursor-pointer border-blue-500 bg-blue-500/10"
                     : "cursor-wait border-amber-500 bg-amber-500/10"
@@ -1016,7 +1033,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                   )}
                 >
                   {voiceInput.isListening
-                    ? `Recording... tap to stop (${formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)})`
+                    ? `Recording... space to send, ${formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} to stop`
                     : "Transcribing..."}
                 </span>
                 <div className="flex items-center gap-1">

From 5914ed117b0243e39ec7490f0846e8622793cddc Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Mon, 1 Dec 2025 23:41:08 -0600
Subject: [PATCH 04/18] fix: auto-focus recording button for spacebar, add
 margin

- Auto-focus the recording overlay button so spacebar works
- Add mb-1 margin to prevent border touching controls below
---
 src/browser/components/ChatInput/index.tsx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index 096447f962..dd59da95e0 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -983,6 +983,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
             {voiceInput.isListening || voiceInput.isTranscribing ? (
               <button
                 type="button"
+                ref={(el) => el?.focus()}
                 onClick={voiceInput.isListening ? voiceInput.toggleListening : undefined}
                 onKeyDown={(e) => {
                   // Space stops recording and sends immediately
@@ -1003,7 +1004,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                 }}
                 disabled={voiceInput.isTranscribing}
                 className={cn(
-                  "flex min-h-[60px] w-full items-center justify-center gap-3 rounded-md border px-4 py-4 transition-all",
+                  "mb-1 flex min-h-[60px] w-full items-center justify-center gap-3 rounded-md border px-4 py-4 transition-all",
                   voiceInput.isListening
                     ? "cursor-pointer border-blue-500 bg-blue-500/10"
                     : "cursor-wait border-amber-500 bg-amber-500/10"

From ffb6e943b776584c67b5db85202bec9f6b817d5e Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Mon, 1 Dec 2025 23:44:59 -0600
Subject: [PATCH 05/18] fix: properly implement space-to-send with hook
 callback

- Add onSend callback to useVoiceInput options
- Add stopListeningAndSend method that sets a flag before stopping
- When transcription completes, if flag was set, call onSend
- Use setTimeout(0) to let React flush state update before sending
- Simplifies ChatInput code by moving logic into the hook
---
 src/browser/components/ChatInput/index.tsx | 15 +++----------
 src/browser/hooks/useVoiceInput.ts         | 25 ++++++++++++++++++++--
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index dd59da95e0..309f09341b 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -192,6 +192,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
   const voiceInput = useVoiceInput({
     onTranscript: handleVoiceTranscript,
     onError: handleVoiceError,
+    onSend: () => void handleSend(),
     openAIKeySet,
   });
 
@@ -986,20 +987,10 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                 ref={(el) => el?.focus()}
                 onClick={voiceInput.isListening ? voiceInput.toggleListening : undefined}
                 onKeyDown={(e) => {
-                  // Space stops recording and sends immediately
+                  // Space stops recording and sends immediately after transcription
                   if (e.key === " " && voiceInput.isListening) {
                     e.preventDefault();
-                    voiceInput.stopListening();
-                    // Small delay to let transcription complete, then send
-                    // The transcript callback will update input, then we send
-                    const checkAndSend = () => {
-                      if (!voiceInput.isTranscribing) {
-                        void handleSend();
-                      } else {
-                        setTimeout(checkAndSend, 100);
-                      }
-                    };
-                    setTimeout(checkAndSend, 100);
+                    voiceInput.stopListeningAndSend();
                   }
                 }}
                 disabled={voiceInput.isTranscribing}
diff --git a/src/browser/hooks/useVoiceInput.ts b/src/browser/hooks/useVoiceInput.ts
index e92b6e884e..d5375b522c 100644
--- a/src/browser/hooks/useVoiceInput.ts
+++ b/src/browser/hooks/useVoiceInput.ts
@@ -29,6 +29,8 @@ export interface UseVoiceInputOptions {
   onTranscript: (text: string, isFinal: boolean) => void;
   /** Called when an error occurs */
   onError?: (error: string) => void;
+  /** Called to send the message (used by stopListeningAndSend) */
+  onSend?: () => void;
   /** Whether OpenAI API key is configured */
   openAIKeySet: boolean;
 }
@@ -46,12 +48,14 @@ export interface UseVoiceInputResult {
   startListening: () => void;
   /** Stop recording and transcribe */
   stopListening: () => void;
+  /** Stop recording, transcribe, and send when done */
+  stopListeningAndSend: () => void;
   /** Toggle recording state */
   toggleListening: () => void;
 }
 
 export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResult {
-  const { onTranscript, onError, openAIKeySet } = options;
+  const { onTranscript, onError, onSend, openAIKeySet } = options;
 
   const [isListening, setIsListening] = useState(false);
   const [isTranscribing, setIsTranscribing] = useState(false);
@@ -59,6 +63,8 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
   const mediaRecorderRef = useRef<MediaRecorder | null>(null);
   const audioChunksRef = useRef<Blob[]>([]);
   const streamRef = useRef<MediaStream | null>(null);
+  // Flag to auto-send after transcription completes
+  const sendAfterTranscribeRef = useRef(false);
 
   const isSupported = isMediaRecorderSupported();
   const isMobile = isMobileDevice();
@@ -66,13 +72,18 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
   // Store callbacks in refs to avoid recreating on every render
   const onTranscriptRef = useRef(onTranscript);
   const onErrorRef = useRef(onError);
+  const onSendRef = useRef(onSend);
   useEffect(() => {
     onTranscriptRef.current = onTranscript;
     onErrorRef.current = onError;
-  }, [onTranscript, onError]);
+    onSendRef.current = onSend;
+  }, [onTranscript, onError, onSend]);
 
   const transcribeAudio = useCallback(async (audioBlob: Blob) => {
     setIsTranscribing(true);
+    const shouldSendAfter = sendAfterTranscribeRef.current;
+    sendAfterTranscribeRef.current = false;
+
     try {
       // Convert blob to base64
       const arrayBuffer = await audioBlob.arrayBuffer();
@@ -86,6 +97,10 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
       if (result.success) {
         if (result.data.trim()) {
           onTranscriptRef.current(result.data, true);
+          // Auto-send after transcript is set (use setTimeout to let React update state)
+          if (shouldSendAfter) {
+            setTimeout(() => onSendRef.current?.(), 0);
+          }
         }
       } else {
         onErrorRef.current?.(result.error);
@@ -161,6 +176,11 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
     setIsListening(false);
   }, []);
 
+  const stopListeningAndSend = useCallback(() => {
+    sendAfterTranscribeRef.current = true;
+    stopListening();
+  }, [stopListening]);
+
   const toggleListening = useCallback(() => {
     if (isListening) {
       stopListening();
@@ -188,6 +208,7 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
     shouldShowUI: isSupported && !isMobile && openAIKeySet,
     startListening: () => void startListening(),
     stopListening,
+    stopListeningAndSend,
     toggleListening,
   };
 }

From 53cefb3a91bb27a84ef6bddfd0c156d9449afae7 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Mon, 1 Dec 2025 23:46:01 -0600
Subject: [PATCH 06/18] style: reduce vertical space in chat controls

- Reduce gap between control rows from gap-1 to gap-0.5
- Reduce vertical wrap gap from gap-y-2 to gap-y-1
- Reduce send button padding from px-2 py-1 to px-1.5 py-0.5
---
 src/browser/components/ChatInput/index.tsx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index 309f09341b..1531711ef4 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -1086,7 +1086,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
           {/* Image attachments */}
           <ImageAttachments images={imageAttachments} onRemove={handleRemoveImage} />
 
-          <div className="flex flex-col gap-1" data-component="ChatModeToggles">
+          <div className="flex flex-col gap-0.5" data-component="ChatModeToggles">
             {/* Editing indicator - workspace only */}
             {variant === "workspace" && editingMessage && (
               <div className="text-edit-mode text-[11px] font-medium">
@@ -1094,7 +1094,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
               </div>
             )}
 
-            <div className="@container flex flex-wrap items-center gap-x-3 gap-y-2">
+            <div className="@container flex flex-wrap items-center gap-x-3 gap-y-1">
               {/* Model Selector - always visible */}
               <div
                 className="flex items-center"
@@ -1176,7 +1176,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                     disabled={!canSend}
                     aria-label="Send message"
                     className={cn(
-                      "inline-flex items-center gap-1 rounded-sm border border-border-light px-2 py-1 text-[11px] font-medium text-white transition-colors duration-200 disabled:opacity-50",
+                      "inline-flex items-center gap-1 rounded-sm border border-border-light px-1.5 py-0.5 text-[11px] font-medium text-white transition-colors duration-200 disabled:opacity-50",
                       mode === "plan"
                         ? "bg-plan-mode hover:bg-plan-mode-hover disabled:hover:bg-plan-mode"
                         : "bg-exec-mode hover:bg-exec-mode-hover disabled:hover:bg-exec-mode"

From bc400fbd98bca0f024aa8ee11306346667e721ea Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Mon, 1 Dec 2025 23:47:02 -0600
Subject: [PATCH 07/18] style: make toggle group match send button size

- Change ToggleGroup padding from px-2 py-1 to px-1.5 py-0.5
- Keeps mode selector and send button visually consistent
---
 src/browser/components/ToggleGroup.tsx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/browser/components/ToggleGroup.tsx b/src/browser/components/ToggleGroup.tsx
index 8525c983c9..0f812f7b4c 100644
--- a/src/browser/components/ToggleGroup.tsx
+++ b/src/browser/components/ToggleGroup.tsx
@@ -30,7 +30,7 @@ export function ToggleGroup<T extends string>({
         onClick={() => onChange(nextOption.value)}
         type="button"
         className={cn(
-          "px-2 py-1 text-[11px] font-sans rounded-sm border-none cursor-pointer transition-all duration-150",
+          "px-1.5 py-0.5 text-[11px] font-sans rounded-sm border-none cursor-pointer transition-all duration-150",
           "text-toggle-text-active bg-toggle-active font-medium",
           activeOption?.activeClassName
         )}
@@ -52,7 +52,7 @@ export function ToggleGroup<T extends string>({
             aria-pressed={isActive}
             type="button"
             className={cn(
-              "px-2 py-1 text-[11px] font-sans rounded-sm border-none cursor-pointer transition-all duration-150 bg-transparent",
+              "px-1.5 py-0.5 text-[11px] font-sans rounded-sm border-none cursor-pointer transition-all duration-150 bg-transparent",
               isActive
                 ? "text-toggle-text-active bg-toggle-active font-medium"
                 : "text-toggle-text font-normal hover:text-toggle-text-hover hover:bg-toggle-hover",

From 4d9552d428cf677d1065ed727521d9757e8cd7df Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Mon, 1 Dec 2025 23:51:10 -0600
Subject: [PATCH 08/18] refactor: voice input cleanup and user education
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User education:
- Show mic button even without OpenAI key (disabled with tooltip)
- Tooltip explains: 'Configure in Settings → Providers'
- Toast error when trying to use keybind/command without key

DRY improvements:
- Extract WaveformBars component for reusable animated bars
- Remove unused Web Speech API error message mappings

Code quality:
- Add isApiKeySet to hook result for explicit checking
- shouldShowUI now only checks platform support, not API key
- Verified no race conditions in hook logic
---
 .../components/ChatInput/VoiceInputButton.tsx | 35 +++++++---
 .../components/ChatInput/WaveformBars.tsx     | 32 +++++++++
 src/browser/components/ChatInput/index.tsx    | 67 ++++++++-----------
 src/browser/hooks/useVoiceInput.ts            |  8 ++-
 4 files changed, 91 insertions(+), 51 deletions(-)
 create mode 100644 src/browser/components/ChatInput/WaveformBars.tsx

diff --git a/src/browser/components/ChatInput/VoiceInputButton.tsx b/src/browser/components/ChatInput/VoiceInputButton.tsx
index 15548135bc..07103310f2 100644
--- a/src/browser/components/ChatInput/VoiceInputButton.tsx
+++ b/src/browser/components/ChatInput/VoiceInputButton.tsx
@@ -6,7 +6,8 @@
  * - Idle: Subtle gray mic icon
  * - Recording: Blue pulsing mic
  * - Transcribing: Amber spinning loader
- * - Hidden: When on mobile, unsupported, or no OpenAI key
+ * - Disabled (no API key): Subtle gray with explanatory tooltip
+ * - Hidden: When on mobile or unsupported
  */
 
 import React from "react";
@@ -19,22 +20,26 @@ interface VoiceInputButtonProps {
   isListening: boolean;
   isTranscribing: boolean;
   isSupported: boolean;
+  isApiKeySet: boolean;
   shouldShowUI: boolean;
   onToggle: () => void;
   disabled?: boolean;
 }
 
 export const VoiceInputButton: React.FC<VoiceInputButtonProps> = (props) => {
-  // Don't render if we shouldn't show UI (mobile, unsupported, or no OpenAI key)
+  // Don't render on mobile or unsupported platforms
   if (!props.shouldShowUI) {
     return null;
   }
 
-  const label = props.isTranscribing
-    ? "Transcribing..."
-    : props.isListening
-      ? "Stop recording"
-      : "Voice input";
+  const needsApiKey = !props.isApiKeySet;
+  const label = needsApiKey
+    ? "Voice input (requires OpenAI API key)"
+    : props.isTranscribing
+      ? "Transcribing..."
+      : props.isListening
+        ? "Stop recording"
+        : "Voice input";
 
   const Icon = props.isTranscribing ? Loader2 : Mic;
 
@@ -43,7 +48,9 @@ export const VoiceInputButton: React.FC<VoiceInputButtonProps> = (props) => {
       <button
         type="button"
         onClick={props.onToggle}
-        disabled={(props.disabled ?? false) || !props.isSupported || props.isTranscribing}
+        disabled={
+          (props.disabled ?? false) || !props.isSupported || props.isTranscribing || needsApiKey
+        }
         aria-label={label}
         aria-pressed={props.isListening}
         className={cn(
@@ -59,7 +66,17 @@ export const VoiceInputButton: React.FC<VoiceInputButtonProps> = (props) => {
         <Icon className={cn("h-4 w-4", props.isTranscribing && "animate-spin")} strokeWidth={1.5} />
       </button>
       <Tooltip className="tooltip" align="right">
-        {label} ({formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)})
+        {needsApiKey ? (
+          <>
+            Voice input requires OpenAI API key.
+            <br />
+            Configure in Settings → Providers.
+          </>
+        ) : (
+          <>
+            {label} ({formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)})
+          </>
+        )}
       </Tooltip>
     </TooltipWrapper>
   );
diff --git a/src/browser/components/ChatInput/WaveformBars.tsx b/src/browser/components/ChatInput/WaveformBars.tsx
new file mode 100644
index 0000000000..e8c834016b
--- /dev/null
+++ b/src/browser/components/ChatInput/WaveformBars.tsx
@@ -0,0 +1,32 @@
+/**
+ * Animated waveform bars for voice recording UI.
+ * Shows 5 bars with staggered pulse animation.
+ */
+
+import { cn } from "@/common/lib/utils";
+
+interface WaveformBarsProps {
+  /** Color class for the bars (e.g., "bg-blue-500") */
+  colorClass: string;
+  /** Whether to mirror the animation (for right-side waveform) */
+  mirrored?: boolean;
+}
+
+export const WaveformBars: React.FC<WaveformBarsProps> = (props) => {
+  const indices = props.mirrored ? [4, 3, 2, 1, 0] : [0, 1, 2, 3, 4];
+
+  return (
+    <div className="flex items-center gap-1">
+      {indices.map((i, displayIndex) => (
+        <div
+          key={displayIndex}
+          className={cn("w-1 rounded-full", props.colorClass)}
+          style={{
+            height: `${12 + Math.sin(i * 0.8) * 8}px`,
+            animation: `pulse 0.8s ease-in-out ${i * 0.1}s infinite alternate`,
+          }}
+        />
+      ))}
+    </div>
+  );
+};
diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index 1531711ef4..303630546c 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -67,6 +67,7 @@ import { useCreationWorkspace } from "./useCreationWorkspace";
 import { useTutorial } from "@/browser/contexts/TutorialContext";
 import { useVoiceInput } from "@/browser/hooks/useVoiceInput";
 import { VoiceInputButton } from "./VoiceInputButton";
+import { WaveformBars } from "./WaveformBars";
 
 const LEADING_COMMAND_NOISE = /^(?:\s|\u200B|\u200C|\u200D|\u200E|\u200F|\uFEFF)+/;
 
@@ -173,17 +174,10 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
 
   const handleVoiceError = useCallback(
     (error: string) => {
-      // Map common errors to user-friendly messages
-      const errorMessages: Record<string, string> = {
-        "not-allowed": "Microphone access denied. Please allow microphone access and try again.",
-        "no-speech": "No speech detected. Please try again.",
-        network: "Network error. Please check your connection.",
-        "audio-capture": "No microphone found. Please connect a microphone.",
-      };
       setToast({
         id: Date.now().toString(),
         type: "error",
-        message: errorMessages[error] ?? `Voice input error: ${error}`,
+        message: error,
       });
     },
     [setToast]
@@ -506,12 +500,20 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
     if (!voiceInput.shouldShowUI) return;
 
     const handler = () => {
+      if (!voiceInput.isApiKeySet) {
+        setToast({
+          id: Date.now().toString(),
+          type: "error",
+          message: "Voice input requires OpenAI API key. Configure in Settings → Providers.",
+        });
+        return;
+      }
       voiceInput.toggleListening();
     };
     window.addEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener);
     return () =>
       window.removeEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener);
-  }, [voiceInput]);
+  }, [voiceInput, setToast]);
 
   // Auto-focus chat input when workspace changes (workspace only)
   const workspaceIdForFocus = variant === "workspace" ? props.workspaceId : null;
@@ -847,6 +849,14 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
     // Handle voice input toggle (Ctrl+D / Cmd+D)
     if (matchesKeybind(e, KEYBINDS.TOGGLE_VOICE_INPUT) && voiceInput.shouldShowUI) {
       e.preventDefault();
+      if (!voiceInput.isApiKeySet) {
+        setToast({
+          id: Date.now().toString(),
+          type: "error",
+          message: "Voice input requires OpenAI API key. Configure in Settings → Providers.",
+        });
+        return;
+      }
       voiceInput.toggleListening();
       return;
     }
@@ -1002,22 +1012,9 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                 )}
                 aria-label={voiceInput.isListening ? "Stop recording" : "Transcribing..."}
               >
-                {/* Animated waveform bars */}
-                <div className="flex items-center gap-1">
-                  {[0, 1, 2, 3, 4].map((i) => (
-                    <div
-                      key={i}
-                      className={cn(
-                        "w-1 rounded-full",
-                        voiceInput.isListening ? "bg-blue-500" : "bg-amber-500"
-                      )}
-                      style={{
-                        height: `${12 + Math.sin(i * 0.8) * 8}px`,
-                        animation: `pulse 0.8s ease-in-out ${i * 0.1}s infinite alternate`,
-                      }}
-                    />
-                  ))}
-                </div>
+                <WaveformBars
+                  colorClass={voiceInput.isListening ? "bg-blue-500" : "bg-amber-500"}
+                />
                 <span
                   className={cn(
                     "text-sm font-medium",
@@ -1028,21 +1025,10 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                     ? `Recording... space to send, ${formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} to stop`
                     : "Transcribing..."}
                 </span>
-                <div className="flex items-center gap-1">
-                  {[0, 1, 2, 3, 4].map((i) => (
-                    <div
-                      key={i}
-                      className={cn(
-                        "w-1 rounded-full",
-                        voiceInput.isListening ? "bg-blue-500" : "bg-amber-500"
-                      )}
-                      style={{
-                        height: `${12 + Math.sin((4 - i) * 0.8) * 8}px`,
-                        animation: `pulse 0.8s ease-in-out ${(4 - i) * 0.1}s infinite alternate`,
-                      }}
-                    />
-                  ))}
-                </div>
+                <WaveformBars
+                  colorClass={voiceInput.isListening ? "bg-blue-500" : "bg-amber-500"}
+                  mirrored
+                />
               </button>
             ) : (
               <>
@@ -1074,6 +1060,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                     isListening={voiceInput.isListening}
                     isTranscribing={voiceInput.isTranscribing}
                     isSupported={voiceInput.isSupported}
+                    isApiKeySet={voiceInput.isApiKeySet}
                     shouldShowUI={voiceInput.shouldShowUI}
                     onToggle={voiceInput.toggleListening}
                     disabled={disabled || isSending}
diff --git a/src/browser/hooks/useVoiceInput.ts b/src/browser/hooks/useVoiceInput.ts
index d5375b522c..5f6c0d3670 100644
--- a/src/browser/hooks/useVoiceInput.ts
+++ b/src/browser/hooks/useVoiceInput.ts
@@ -42,7 +42,9 @@ export interface UseVoiceInputResult {
   isTranscribing: boolean;
   /** Whether the browser supports MediaRecorder */
   isSupported: boolean;
-  /** Whether we should show voice UI (supported, not mobile, API key set) */
+  /** Whether OpenAI API key is configured */
+  isApiKeySet: boolean;
+  /** Whether we should show voice UI (supported and not mobile) */
   shouldShowUI: boolean;
   /** Start recording for voice input */
   startListening: () => void;
@@ -205,7 +207,9 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
     isListening,
     isTranscribing,
     isSupported,
-    shouldShowUI: isSupported && !isMobile && openAIKeySet,
+    isApiKeySet: openAIKeySet,
+    // Show UI on supported desktop platforms (mobile has native dictation)
+    shouldShowUI: isSupported && !isMobile,
     startListening: () => void startListening(),
     stopListening,
     stopListeningAndSend,

From adcec7cfaa2e042e8d7f1d2120920b19066cd085 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Mon, 1 Dec 2025 23:55:07 -0600
Subject: [PATCH 09/18] refactor: clean up useVoiceInput with enum state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace isListening/isTranscribing booleans with single state enum
- Merge stopListening/stopListeningAndSend into stop(options?)
- Rename methods: startListening→start, toggleListening→toggle
- Consolidate callback refs into single callbacksRef object
- Move platform checks (isMobile, isSupported) to module scope
- Simplify VoiceInputButton with STATE_CONFIG lookup table
- Inline simple callbacks in ChatInput (no separate handlers)
---
 .../components/ChatInput/VoiceInputButton.tsx |  50 ++---
 src/browser/components/ChatInput/index.tsx    |  60 ++---
 src/browser/hooks/useVoiceInput.ts            | 207 +++++++-----------
 3 files changed, 115 insertions(+), 202 deletions(-)

diff --git a/src/browser/components/ChatInput/VoiceInputButton.tsx b/src/browser/components/ChatInput/VoiceInputButton.tsx
index 07103310f2..b4f27bede8 100644
--- a/src/browser/components/ChatInput/VoiceInputButton.tsx
+++ b/src/browser/components/ChatInput/VoiceInputButton.tsx
@@ -1,13 +1,6 @@
 /**
  * Voice input button - floats inside the chat input textarea.
  * Minimal footprint: just an icon that changes color based on state.
- *
- * Visual states:
- * - Idle: Subtle gray mic icon
- * - Recording: Blue pulsing mic
- * - Transcribing: Amber spinning loader
- * - Disabled (no API key): Subtle gray with explanatory tooltip
- * - Hidden: When on mobile or unsupported
  */
 
 import React from "react";
@@ -15,55 +8,48 @@ import { Mic, Loader2 } from "lucide-react";
 import { TooltipWrapper, Tooltip } from "../Tooltip";
 import { formatKeybind, KEYBINDS } from "@/browser/utils/ui/keybinds";
 import { cn } from "@/common/lib/utils";
+import type { VoiceInputState } from "@/browser/hooks/useVoiceInput";
 
 interface VoiceInputButtonProps {
-  isListening: boolean;
-  isTranscribing: boolean;
-  isSupported: boolean;
+  state: VoiceInputState;
   isApiKeySet: boolean;
   shouldShowUI: boolean;
   onToggle: () => void;
   disabled?: boolean;
 }
 
+const STATE_CONFIG: Record<VoiceInputState, { label: string; colorClass: string }> = {
+  idle: { label: "Voice input", colorClass: "text-muted/50 hover:text-muted" },
+  recording: { label: "Stop recording", colorClass: "text-blue-500 animate-pulse" },
+  transcribing: { label: "Transcribing...", colorClass: "text-amber-500" },
+};
+
 export const VoiceInputButton: React.FC<VoiceInputButtonProps> = (props) => {
-  // Don't render on mobile or unsupported platforms
-  if (!props.shouldShowUI) {
-    return null;
-  }
+  if (!props.shouldShowUI) return null;
 
   const needsApiKey = !props.isApiKeySet;
-  const label = needsApiKey
-    ? "Voice input (requires OpenAI API key)"
-    : props.isTranscribing
-      ? "Transcribing..."
-      : props.isListening
-        ? "Stop recording"
-        : "Voice input";
+  const { label, colorClass } = needsApiKey
+    ? { label: "Voice input (requires OpenAI API key)", colorClass: "text-muted/50" }
+    : STATE_CONFIG[props.state];
 
-  const Icon = props.isTranscribing ? Loader2 : Mic;
+  const Icon = props.state === "transcribing" ? Loader2 : Mic;
+  const isTranscribing = props.state === "transcribing";
 
   return (
     <TooltipWrapper inline>
       <button
         type="button"
         onClick={props.onToggle}
-        disabled={
-          (props.disabled ?? false) || !props.isSupported || props.isTranscribing || needsApiKey
-        }
+        disabled={(props.disabled ?? false) || isTranscribing || needsApiKey}
         aria-label={label}
-        aria-pressed={props.isListening}
+        aria-pressed={props.state === "recording"}
         className={cn(
           "inline-flex items-center justify-center rounded p-0.5 transition-colors duration-150",
           "disabled:cursor-not-allowed disabled:opacity-40",
-          props.isTranscribing
-            ? "text-amber-500"
-            : props.isListening
-              ? "text-blue-500 animate-pulse"
-              : "text-muted/50 hover:text-muted"
+          colorClass
         )}
       >
-        <Icon className={cn("h-4 w-4", props.isTranscribing && "animate-spin")} strokeWidth={1.5} />
+        <Icon className={cn("h-4 w-4", isTranscribing && "animate-spin")} strokeWidth={1.5} />
       </button>
       <Tooltip className="tooltip" align="right">
         {needsApiKey ? (
diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index 303630546c..7c733e013b 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -160,32 +160,17 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
   // Track if OpenAI API key is configured for voice input
   const [openAIKeySet, setOpenAIKeySet] = useState(false);
 
-  // Voice input handling - appends transcribed text to input
-  const handleVoiceTranscript = useCallback(
-    (text: string, _isFinal: boolean) => {
-      // Whisper only returns final results, append to input with space separator if needed
+  // Voice input - appends transcribed text to input
+  const voiceInput = useVoiceInput({
+    onTranscript: (text) => {
       setInput((prev) => {
         const separator = prev.length > 0 && !prev.endsWith(" ") ? " " : "";
         return prev + separator + text;
       });
     },
-    [setInput]
-  );
-
-  const handleVoiceError = useCallback(
-    (error: string) => {
-      setToast({
-        id: Date.now().toString(),
-        type: "error",
-        message: error,
-      });
+    onError: (error) => {
+      setToast({ id: Date.now().toString(), type: "error", message: error });
     },
-    [setToast]
-  );
-
-  const voiceInput = useVoiceInput({
-    onTranscript: handleVoiceTranscript,
-    onError: handleVoiceError,
     onSend: () => void handleSend(),
     openAIKeySet,
   });
@@ -508,7 +493,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
         });
         return;
       }
-      voiceInput.toggleListening();
+      voiceInput.toggle();
     };
     window.addEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener);
     return () =>
@@ -857,7 +842,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
         });
         return;
       }
-      voiceInput.toggleListening();
+      voiceInput.toggle();
       return;
     }
 
@@ -990,43 +975,42 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
           />
 
           <div className="relative flex items-end" data-component="ChatInputControls">
-            {/* Recording/transcribing overlay - dramatically replaces textarea */}
-            {voiceInput.isListening || voiceInput.isTranscribing ? (
+            {/* Recording/transcribing overlay - replaces textarea when active */}
+            {voiceInput.state !== "idle" ? (
               <button
                 type="button"
                 ref={(el) => el?.focus()}
-                onClick={voiceInput.isListening ? voiceInput.toggleListening : undefined}
+                onClick={voiceInput.state === "recording" ? voiceInput.toggle : undefined}
                 onKeyDown={(e) => {
-                  // Space stops recording and sends immediately after transcription
-                  if (e.key === " " && voiceInput.isListening) {
+                  if (e.key === " " && voiceInput.state === "recording") {
                     e.preventDefault();
-                    voiceInput.stopListeningAndSend();
+                    voiceInput.stop({ send: true });
                   }
                 }}
-                disabled={voiceInput.isTranscribing}
+                disabled={voiceInput.state === "transcribing"}
                 className={cn(
                   "mb-1 flex min-h-[60px] w-full items-center justify-center gap-3 rounded-md border px-4 py-4 transition-all",
-                  voiceInput.isListening
+                  voiceInput.state === "recording"
                     ? "cursor-pointer border-blue-500 bg-blue-500/10"
                     : "cursor-wait border-amber-500 bg-amber-500/10"
                 )}
-                aria-label={voiceInput.isListening ? "Stop recording" : "Transcribing..."}
+                aria-label={voiceInput.state === "recording" ? "Stop recording" : "Transcribing..."}
               >
                 <WaveformBars
-                  colorClass={voiceInput.isListening ? "bg-blue-500" : "bg-amber-500"}
+                  colorClass={voiceInput.state === "recording" ? "bg-blue-500" : "bg-amber-500"}
                 />
                 <span
                   className={cn(
                     "text-sm font-medium",
-                    voiceInput.isListening ? "text-blue-500" : "text-amber-500"
+                    voiceInput.state === "recording" ? "text-blue-500" : "text-amber-500"
                   )}
                 >
-                  {voiceInput.isListening
+                  {voiceInput.state === "recording"
                     ? `Recording... space to send, ${formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} to stop`
                     : "Transcribing..."}
                 </span>
                 <WaveformBars
-                  colorClass={voiceInput.isListening ? "bg-blue-500" : "bg-amber-500"}
+                  colorClass={voiceInput.state === "recording" ? "bg-blue-500" : "bg-amber-500"}
                   mirrored
                 />
               </button>
@@ -1057,12 +1041,10 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                 {/* Floating voice input button inside textarea */}
                 <div className="absolute right-2 bottom-2">
                   <VoiceInputButton
-                    isListening={voiceInput.isListening}
-                    isTranscribing={voiceInput.isTranscribing}
-                    isSupported={voiceInput.isSupported}
+                    state={voiceInput.state}
                     isApiKeySet={voiceInput.isApiKeySet}
                     shouldShowUI={voiceInput.shouldShowUI}
-                    onToggle={voiceInput.toggleListening}
+                    onToggle={voiceInput.toggle}
                     disabled={disabled || isSending}
                   />
                 </div>
diff --git a/src/browser/hooks/useVoiceInput.ts b/src/browser/hooks/useVoiceInput.ts
index 5f6c0d3670..f196051f08 100644
--- a/src/browser/hooks/useVoiceInput.ts
+++ b/src/browser/hooks/useVoiceInput.ts
@@ -1,218 +1,163 @@
 /**
  * Hook for voice input using OpenAI Whisper API via MediaRecorder.
  *
- * Features:
- * - Records audio using MediaRecorder (webm/opus format)
- * - Sends to backend which calls OpenAI Whisper for transcription
- * - Shows recording state while capturing
- * - Shows transcribing state while processing
- * - Hidden on mobile (native keyboards have built-in dictation)
- * - Disabled when OpenAI API key not configured
+ * Records audio, sends to backend for Whisper transcription, returns text.
+ * Hidden on mobile (native keyboards have built-in dictation).
  */
 
 import { useState, useCallback, useRef, useEffect } from "react";
 
-// Check if we're on a mobile device (touch-based)
-function isMobileDevice(): boolean {
-  if (typeof window === "undefined") return false;
-  // Check for touch capability and small screen as heuristics
-  return ("ontouchstart" in window || navigator.maxTouchPoints > 0) && window.innerWidth < 768;
-}
-
-// Check if MediaRecorder is available
-function isMediaRecorderSupported(): boolean {
-  return typeof window !== "undefined" && typeof MediaRecorder !== "undefined";
-}
+export type VoiceInputState = "idle" | "recording" | "transcribing";
 
 export interface UseVoiceInputOptions {
-  /** Called when transcript text is received */
-  onTranscript: (text: string, isFinal: boolean) => void;
-  /** Called when an error occurs */
+  onTranscript: (text: string) => void;
   onError?: (error: string) => void;
-  /** Called to send the message (used by stopListeningAndSend) */
   onSend?: () => void;
-  /** Whether OpenAI API key is configured */
   openAIKeySet: boolean;
 }
 
 export interface UseVoiceInputResult {
-  /** Whether voice input is currently recording */
-  isListening: boolean;
-  /** Whether transcription is in progress */
-  isTranscribing: boolean;
-  /** Whether the browser supports MediaRecorder */
+  state: VoiceInputState;
   isSupported: boolean;
-  /** Whether OpenAI API key is configured */
   isApiKeySet: boolean;
-  /** Whether we should show voice UI (supported and not mobile) */
+  /** Show UI on supported desktop platforms (mobile has native dictation) */
   shouldShowUI: boolean;
-  /** Start recording for voice input */
-  startListening: () => void;
-  /** Stop recording and transcribe */
-  stopListening: () => void;
-  /** Stop recording, transcribe, and send when done */
-  stopListeningAndSend: () => void;
-  /** Toggle recording state */
-  toggleListening: () => void;
+  start: () => void;
+  stop: (options?: { send?: boolean }) => void;
+  toggle: () => void;
 }
 
-export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResult {
-  const { onTranscript, onError, onSend, openAIKeySet } = options;
+// Platform checks (evaluated once)
+const isMobile =
+  typeof window !== "undefined" &&
+  ("ontouchstart" in window || navigator.maxTouchPoints > 0) &&
+  window.innerWidth < 768;
 
-  const [isListening, setIsListening] = useState(false);
-  const [isTranscribing, setIsTranscribing] = useState(false);
+const isSupported = typeof window !== "undefined" && typeof MediaRecorder !== "undefined";
+
+export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResult {
+  const [state, setState] = useState<VoiceInputState>("idle");
 
   const mediaRecorderRef = useRef<MediaRecorder | null>(null);
   const audioChunksRef = useRef<Blob[]>([]);
   const streamRef = useRef<MediaStream | null>(null);
-  // Flag to auto-send after transcription completes
   const sendAfterTranscribeRef = useRef(false);
 
-  const isSupported = isMediaRecorderSupported();
-  const isMobile = isMobileDevice();
-
-  // Store callbacks in refs to avoid recreating on every render
-  const onTranscriptRef = useRef(onTranscript);
-  const onErrorRef = useRef(onError);
-  const onSendRef = useRef(onSend);
+  // Store callbacks in refs to avoid stale closures
+  const callbacksRef = useRef(options);
   useEffect(() => {
-    onTranscriptRef.current = onTranscript;
-    onErrorRef.current = onError;
-    onSendRef.current = onSend;
-  }, [onTranscript, onError, onSend]);
-
-  const transcribeAudio = useCallback(async (audioBlob: Blob) => {
-    setIsTranscribing(true);
-    const shouldSendAfter = sendAfterTranscribeRef.current;
+    callbacksRef.current = options;
+  }, [options]);
+
+  const transcribe = useCallback(async (audioBlob: Blob) => {
+    setState("transcribing");
+    const shouldSend = sendAfterTranscribeRef.current;
     sendAfterTranscribeRef.current = false;
 
     try {
-      // Convert blob to base64
       const arrayBuffer = await audioBlob.arrayBuffer();
       const base64 = btoa(
         new Uint8Array(arrayBuffer).reduce((data, byte) => data + String.fromCharCode(byte), "")
       );
 
-      // Call backend to transcribe
       const result = await window.api.voice.transcribe(base64);
 
-      if (result.success) {
-        if (result.data.trim()) {
-          onTranscriptRef.current(result.data, true);
-          // Auto-send after transcript is set (use setTimeout to let React update state)
-          if (shouldSendAfter) {
-            setTimeout(() => onSendRef.current?.(), 0);
-          }
+      if (result.success && result.data.trim()) {
+        callbacksRef.current.onTranscript(result.data);
+        if (shouldSend) {
+          setTimeout(() => callbacksRef.current.onSend?.(), 0);
         }
-      } else {
-        onErrorRef.current?.(result.error);
+      } else if (!result.success) {
+        callbacksRef.current.onError?.(result.error);
       }
     } catch (err) {
       const message = err instanceof Error ? err.message : String(err);
-      onErrorRef.current?.(`Transcription failed: ${message}`);
+      callbacksRef.current.onError?.(`Transcription failed: ${message}`);
     } finally {
-      setIsTranscribing(false);
+      setState("idle");
     }
   }, []);
 
-  const startListening = useCallback(async () => {
-    if (!isSupported || isListening || isTranscribing || !openAIKeySet) return;
+  const start = useCallback(async () => {
+    if (!isSupported || state !== "idle" || !callbacksRef.current.openAIKeySet) return;
 
     try {
       const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
       streamRef.current = stream;
 
-      // Use webm/opus which is well supported and works with Whisper
       const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus")
         ? "audio/webm;codecs=opus"
         : "audio/webm";
 
-      const mediaRecorder = new MediaRecorder(stream, { mimeType });
+      const recorder = new MediaRecorder(stream, { mimeType });
       audioChunksRef.current = [];
 
-      mediaRecorder.ondataavailable = (event) => {
-        if (event.data.size > 0) {
-          audioChunksRef.current.push(event.data);
-        }
+      recorder.ondataavailable = (e) => {
+        if (e.data.size > 0) audioChunksRef.current.push(e.data);
       };
 
-      mediaRecorder.onstop = () => {
-        const audioBlob = new Blob(audioChunksRef.current, { type: mimeType });
+      recorder.onstop = () => {
+        const blob = new Blob(audioChunksRef.current, { type: mimeType });
         audioChunksRef.current = [];
-
-        // Stop all tracks to release microphone
-        stream.getTracks().forEach((track) => track.stop());
+        stream.getTracks().forEach((t) => t.stop());
         streamRef.current = null;
-
-        // Transcribe the audio
-        void transcribeAudio(audioBlob);
+        void transcribe(blob);
       };
 
-      mediaRecorder.onerror = () => {
-        onErrorRef.current?.("Recording failed");
-        setIsListening(false);
-        stream.getTracks().forEach((track) => track.stop());
+      recorder.onerror = () => {
+        callbacksRef.current.onError?.("Recording failed");
+        setState("idle");
+        stream.getTracks().forEach((t) => t.stop());
         streamRef.current = null;
       };
 
-      mediaRecorderRef.current = mediaRecorder;
-      mediaRecorder.start();
-      setIsListening(true);
+      mediaRecorderRef.current = recorder;
+      recorder.start();
+      setState("recording");
     } catch (err) {
       const message = err instanceof Error ? err.message : String(err);
-      if (message.includes("Permission denied") || message.includes("NotAllowedError")) {
-        onErrorRef.current?.(
-          "Microphone access denied. Please allow microphone access and try again."
-        );
-      } else {
-        onErrorRef.current?.(`Failed to start recording: ${message}`);
-      }
+      const isPermissionError =
+        message.includes("Permission denied") || message.includes("NotAllowedError");
+      callbacksRef.current.onError?.(
+        isPermissionError
+          ? "Microphone access denied. Please allow microphone access and try again."
+          : `Failed to start recording: ${message}`
+      );
     }
-  }, [isSupported, isListening, isTranscribing, openAIKeySet, transcribeAudio]);
+  }, [state, transcribe]);
 
-  const stopListening = useCallback(() => {
-    if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
-      mediaRecorderRef.current.stop();
+  const stop = useCallback((options?: { send?: boolean }) => {
+    if (options?.send) sendAfterTranscribeRef.current = true;
+    if (mediaRecorderRef.current?.state !== "inactive") {
+      mediaRecorderRef.current?.stop();
       mediaRecorderRef.current = null;
     }
-    setIsListening(false);
+    // Note: setState("idle") not called here - transcribe() handles transition
   }, []);
 
-  const stopListeningAndSend = useCallback(() => {
-    sendAfterTranscribeRef.current = true;
-    stopListening();
-  }, [stopListening]);
-
-  const toggleListening = useCallback(() => {
-    if (isListening) {
-      stopListening();
-    } else {
-      void startListening();
+  const toggle = useCallback(() => {
+    if (state === "recording") {
+      stop();
+    } else if (state === "idle") {
+      void start();
     }
-  }, [isListening, startListening, stopListening]);
+  }, [state, start, stop]);
 
   // Cleanup on unmount
   useEffect(() => {
     return () => {
-      if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
-        mediaRecorderRef.current.stop();
-      }
-      if (streamRef.current) {
-        streamRef.current.getTracks().forEach((track) => track.stop());
-      }
+      mediaRecorderRef.current?.stop();
+      streamRef.current?.getTracks().forEach((t) => t.stop());
     };
   }, []);
 
   return {
-    isListening,
-    isTranscribing,
+    state,
     isSupported,
-    isApiKeySet: openAIKeySet,
-    // Show UI on supported desktop platforms (mobile has native dictation)
+    isApiKeySet: callbacksRef.current.openAIKeySet,
     shouldShowUI: isSupported && !isMobile,
-    startListening: () => void startListening(),
-    stopListening,
-    stopListeningAndSend,
-    toggleListening,
+    start: () => void start(),
+    stop,
+    toggle,
   };
 }

From 944e34143abe60c51a418845f316d678e75fd552 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Mon, 1 Dec 2025 23:59:58 -0600
Subject: [PATCH 10/18] feat: space on empty input starts voice, escape cancels

- Pressing space on empty chat input starts voice recording
  (convenient alternative to Cmd+D)
- Pressing escape during recording cancels without transcribing
- Add cancel() method to voice hook that sets flag to skip transcribe
- Update overlay text to show all shortcuts
---
 src/browser/components/ChatInput/index.tsx | 18 +++++++++++++++++-
 src/browser/hooks/useVoiceInput.ts         | 21 +++++++++++++++++++--
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index 7c733e013b..b51aea6050 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -846,6 +846,19 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
       return;
     }
 
+    // Space on empty input starts voice recording
+    if (
+      e.key === " " &&
+      input.trim() === "" &&
+      voiceInput.shouldShowUI &&
+      voiceInput.isApiKeySet &&
+      voiceInput.state === "idle"
+    ) {
+      e.preventDefault();
+      voiceInput.start();
+      return;
+    }
+
     // Handle open model selector
     if (matchesKeybind(e, KEYBINDS.OPEN_MODEL_SELECTOR)) {
       e.preventDefault();
@@ -985,6 +998,9 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                   if (e.key === " " && voiceInput.state === "recording") {
                     e.preventDefault();
                     voiceInput.stop({ send: true });
+                  } else if (e.key === "Escape" && voiceInput.state === "recording") {
+                    e.preventDefault();
+                    voiceInput.cancel();
                   }
                 }}
                 disabled={voiceInput.state === "transcribing"}
@@ -1006,7 +1022,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                   )}
                 >
                   {voiceInput.state === "recording"
-                    ? `Recording... space to send, ${formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} to stop`
+                    ? `Recording... space to send, ${formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} to stop, esc to cancel`
                     : "Transcribing..."}
                 </span>
                 <WaveformBars
diff --git a/src/browser/hooks/useVoiceInput.ts b/src/browser/hooks/useVoiceInput.ts
index f196051f08..85a96fc38b 100644
--- a/src/browser/hooks/useVoiceInput.ts
+++ b/src/browser/hooks/useVoiceInput.ts
@@ -24,6 +24,8 @@ export interface UseVoiceInputResult {
   shouldShowUI: boolean;
   start: () => void;
   stop: (options?: { send?: boolean }) => void;
+  /** Cancel recording without transcribing (discard audio) */
+  cancel: () => void;
   toggle: () => void;
 }
 
@@ -42,6 +44,7 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
   const audioChunksRef = useRef<Blob[]>([]);
   const streamRef = useRef<MediaStream | null>(null);
   const sendAfterTranscribeRef = useRef(false);
+  const cancelledRef = useRef(false);
 
   // Store callbacks in refs to avoid stale closures
   const callbacksRef = useRef(options);
@@ -97,11 +100,17 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
       };
 
       recorder.onstop = () => {
+        const wasCancelled = cancelledRef.current;
+        cancelledRef.current = false;
         const blob = new Blob(audioChunksRef.current, { type: mimeType });
         audioChunksRef.current = [];
         stream.getTracks().forEach((t) => t.stop());
         streamRef.current = null;
-        void transcribe(blob);
+        if (wasCancelled) {
+          setState("idle");
+        } else {
+          void transcribe(blob);
+        }
       };
 
       recorder.onerror = () => {
@@ -132,7 +141,14 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
       mediaRecorderRef.current?.stop();
       mediaRecorderRef.current = null;
     }
-    // Note: setState("idle") not called here - transcribe() handles transition
+  }, []);
+
+  const cancel = useCallback(() => {
+    cancelledRef.current = true;
+    if (mediaRecorderRef.current?.state !== "inactive") {
+      mediaRecorderRef.current?.stop();
+      mediaRecorderRef.current = null;
+    }
   }, []);
 
   const toggle = useCallback(() => {
@@ -158,6 +174,7 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
     shouldShowUI: isSupported && !isMobile,
     start: () => void start(),
     stop,
+    cancel,
     toggle,
   };
 }

From f82b49f32d041087acc16a5a15d97f80718341dd Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Tue, 2 Dec 2025 00:01:39 -0600
Subject: [PATCH 11/18] fix: remove ugly focus ring, improve voice tooltip

- Add focus:outline-none to recording overlay button
- Update tooltip to document all shortcuts:
  - Space on empty input to start
  - Cmd+D anytime to toggle
  - Space during recording to send
  - Escape to cancel
---
 src/browser/components/ChatInput/VoiceInputButton.tsx | 7 ++++++-
 src/browser/components/ChatInput/index.tsx            | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/browser/components/ChatInput/VoiceInputButton.tsx b/src/browser/components/ChatInput/VoiceInputButton.tsx
index b4f27bede8..228d330b5d 100644
--- a/src/browser/components/ChatInput/VoiceInputButton.tsx
+++ b/src/browser/components/ChatInput/VoiceInputButton.tsx
@@ -60,7 +60,12 @@ export const VoiceInputButton: React.FC<VoiceInputButtonProps> = (props) => {
           </>
         ) : (
           <>
-            {label} ({formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)})
+            <strong>Voice input</strong> — press space on empty input
+            <br />
+            or {formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} anytime
+            <br />
+            <br />
+            While recording: space sends, esc cancels
           </>
         )}
       </Tooltip>
diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index b51aea6050..c6e1b0c04c 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -1005,7 +1005,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
                 }}
                 disabled={voiceInput.state === "transcribing"}
                 className={cn(
-                  "mb-1 flex min-h-[60px] w-full items-center justify-center gap-3 rounded-md border px-4 py-4 transition-all",
+                  "mb-1 flex min-h-[60px] w-full items-center justify-center gap-3 rounded-md border px-4 py-4 transition-all focus:outline-none",
                   voiceInput.state === "recording"
                     ? "cursor-pointer border-blue-500 bg-blue-500/10"
                     : "cursor-wait border-amber-500 bg-amber-500/10"

From 91c65e79d8c631431bd98c574ed62eff64bdbcae Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Tue, 2 Dec 2025 00:02:48 -0600
Subject: [PATCH 12/18] fix: use gpt-4o-transcribe model instead of whisper-1

---
 src/node/services/ipcMain.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/node/services/ipcMain.ts b/src/node/services/ipcMain.ts
index 9fa51fb784..ebd89108bc 100644
--- a/src/node/services/ipcMain.ts
+++ b/src/node/services/ipcMain.ts
@@ -661,7 +661,7 @@ export class IpcMain {
           // Call Whisper API
           const transcription = await client.audio.transcriptions.create({
             file: audioFile,
-            model: "whisper-1",
+            model: "gpt-4o-transcribe",
           });
 
           return Ok(transcription.text);

From 92e31f3af94bbd4110059b3abcc0f7a4c111fc11 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Tue, 2 Dec 2025 00:04:08 -0600
Subject: [PATCH 13/18] fix: global keybinds during recording work regardless
 of focus

- Add window-level keydown listener active only during recording
- Space and Escape work even if overlay button loses focus
- Remove redundant local onKeyDown and auto-focus from button
---
 src/browser/components/ChatInput/index.tsx | 27 ++++++++++++++--------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index c6e1b0c04c..f6b8814b61 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -500,6 +500,23 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
       window.removeEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener);
   }, [voiceInput, setToast]);
 
+  // Global keybinds during recording (work regardless of focus)
+  useEffect(() => {
+    if (voiceInput.state !== "recording") return;
+
+    const handler = (e: KeyboardEvent) => {
+      if (e.key === " ") {
+        e.preventDefault();
+        voiceInput.stop({ send: true });
+      } else if (e.key === "Escape") {
+        e.preventDefault();
+        voiceInput.cancel();
+      }
+    };
+    window.addEventListener("keydown", handler);
+    return () => window.removeEventListener("keydown", handler);
+  }, [voiceInput]);
+
   // Auto-focus chat input when workspace changes (workspace only)
   const workspaceIdForFocus = variant === "workspace" ? props.workspaceId : null;
   useEffect(() => {
@@ -992,17 +1009,7 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
             {voiceInput.state !== "idle" ? (
               <button
                 type="button"
-                ref={(el) => el?.focus()}
                 onClick={voiceInput.state === "recording" ? voiceInput.toggle : undefined}
-                onKeyDown={(e) => {
-                  if (e.key === " " && voiceInput.state === "recording") {
-                    e.preventDefault();
-                    voiceInput.stop({ send: true });
-                  } else if (e.key === "Escape" && voiceInput.state === "recording") {
-                    e.preventDefault();
-                    voiceInput.cancel();
-                  }
-                }}
                 disabled={voiceInput.state === "transcribing"}
                 className={cn(
                   "mb-1 flex min-h-[60px] w-full items-center justify-center gap-3 rounded-md border px-4 py-4 transition-all focus:outline-none",

From 5d71600cae928a122f1440bb38e9d2e6e6f2d322 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Tue, 2 Dec 2025 00:06:51 -0600
Subject: [PATCH 14/18] test: add Storybook story for voice input without API
 key

- Add providersConfig option to setupSimpleChatStory helper
- Add VoiceInputNoApiKey story showing disabled mic with tooltip
- Documents user education for missing OpenAI key
---
 src/browser/stories/App.chat.stories.tsx | 26 ++++++++++++++++++++++++
 src/browser/stories/storyHelpers.ts      |  2 ++
 2 files changed, 28 insertions(+)

diff --git a/src/browser/stories/App.chat.stories.tsx b/src/browser/stories/App.chat.stories.tsx
index 9aa03e0a2e..295d9e5232 100644
--- a/src/browser/stories/App.chat.stories.tsx
+++ b/src/browser/stories/App.chat.stories.tsx
@@ -212,6 +212,32 @@ export const WithAgentStatus: AppStory = {
   ),
 };
 
+/** Voice input button shows user education when OpenAI API key is not set */
+export const VoiceInputNoApiKey: AppStory = {
+  render: () => (
+    <AppWithMocks
+      setup={() => {
+        setupSimpleChatStory({
+          messages: [],
+          // No OpenAI key configured - voice button should be disabled with tooltip
+          providersConfig: {
+            anthropic: { apiKeySet: true },
+            // openai deliberately missing
+          },
+        });
+      }}
+    />
+  ),
+  parameters: {
+    docs: {
+      description: {
+        story:
+          "Shows the voice input button in disabled state when OpenAI API key is not configured. Hover over the mic icon in the chat input to see the user education tooltip.",
+      },
+    },
+  },
+};
+
 /** Streaming/working state with pending tool call */
 export const Streaming: AppStory = {
   render: () => (
diff --git a/src/browser/stories/storyHelpers.ts b/src/browser/stories/storyHelpers.ts
index d35544ba79..3b555e69c3 100644
--- a/src/browser/stories/storyHelpers.ts
+++ b/src/browser/stories/storyHelpers.ts
@@ -55,6 +55,7 @@ export interface SimpleChatSetupOptions {
   projectName?: string;
   messages: MuxMessage[];
   gitStatus?: GitStatusFixture;
+  providersConfig?: Record<string, { apiKeySet: boolean; baseUrl?: string; models?: string[] }>;
 }
 
 /**
@@ -82,6 +83,7 @@ export function setupSimpleChatStory(opts: SimpleChatSetupOptions): void {
       workspaces,
       chatHandlers,
       gitStatus,
+      providersConfig: opts.providersConfig,
     })
   );
 

From fe15e4d1d9571b57ac417489c0b6d107ce36db01 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Tue, 2 Dec 2025 00:11:24 -0600
Subject: [PATCH 15/18] fix: add defense-in-depth mobile check in voice start()

Ensures start() is a no-op on mobile even if somehow called
directly, complementing the UI-layer shouldShowUI guards.
---
 src/browser/hooks/useVoiceInput.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/browser/hooks/useVoiceInput.ts b/src/browser/hooks/useVoiceInput.ts
index 85a96fc38b..8570be8843 100644
--- a/src/browser/hooks/useVoiceInput.ts
+++ b/src/browser/hooks/useVoiceInput.ts
@@ -82,7 +82,7 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
   }, []);
 
   const start = useCallback(async () => {
-    if (!isSupported || state !== "idle" || !callbacksRef.current.openAIKeySet) return;
+    if (!isSupported || isMobile || state !== "idle" || !callbacksRef.current.openAIKeySet) return;
 
     try {
       const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

From 8c56c9c446b540c1f002f6c73a023a752fc6fba0 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Tue, 2 Dec 2025 00:12:04 -0600
Subject: [PATCH 16/18] refactor: consolidate voice input useEffects

Merge the command palette toggle and global recording keybinds into
a single useEffect with shared cleanup.
---
 src/browser/components/ChatInput/index.tsx | 27 +++++++++++-----------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx
index f6b8814b61..0d6e215390 100644
--- a/src/browser/components/ChatInput/index.tsx
+++ b/src/browser/components/ChatInput/index.tsx
@@ -480,11 +480,11 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
       window.removeEventListener(CUSTOM_EVENTS.THINKING_LEVEL_TOAST, handler as EventListener);
   }, [variant, props, setToast]);
 
-  // Listen for voice input toggle from command palette
+  // Voice input: command palette toggle + global recording keybinds
   useEffect(() => {
     if (!voiceInput.shouldShowUI) return;
 
-    const handler = () => {
+    const handleToggle = () => {
       if (!voiceInput.isApiKeySet) {
         setToast({
           id: Date.now().toString(),
@@ -495,16 +495,10 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
       }
       voiceInput.toggle();
     };
-    window.addEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener);
-    return () =>
-      window.removeEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener);
-  }, [voiceInput, setToast]);
-
-  // Global keybinds during recording (work regardless of focus)
-  useEffect(() => {
-    if (voiceInput.state !== "recording") return;
 
-    const handler = (e: KeyboardEvent) => {
+    // Global keybinds only active during recording
+    const handleKeyDown = (e: KeyboardEvent) => {
+      if (voiceInput.state !== "recording") return;
       if (e.key === " ") {
         e.preventDefault();
         voiceInput.stop({ send: true });
@@ -513,9 +507,14 @@ export const ChatInput: React.FC<ChatInputProps> = (props) => {
         voiceInput.cancel();
       }
     };
-    window.addEventListener("keydown", handler);
-    return () => window.removeEventListener("keydown", handler);
-  }, [voiceInput]);
+
+    window.addEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handleToggle as EventListener);
+    window.addEventListener("keydown", handleKeyDown);
+    return () => {
+      window.removeEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handleToggle as EventListener);
+      window.removeEventListener("keydown", handleKeyDown);
+    };
+  }, [voiceInput, setToast]);
 
   // Auto-focus chat input when workspace changes (workspace only)
   const workspaceIdForFocus = variant === "workspace" ? props.workspaceId : null;

From 4ffcec94faf62971e826cfb9d002db6c3cec86d4 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Tue, 2 Dec 2025 00:15:14 -0600
Subject: [PATCH 17/18] refactor: improve useVoiceInput clarity and touch
 detection

- Rename isMobile to HAS_TOUCH_DICTATION with clear doc comment
- Remove screen size check (iPads have dictation regardless of size)
- Add section headers for visual organization
- Extract releaseStream helper to reduce duplication
- Improve variable names (recorder, chunks, buffer)
- Add early returns to reduce nesting in transcribe()
- Rename refs for clarity (shouldSendRef, wasCancelledRef)
- Better comments explaining the state machine and logic
---
 src/browser/hooks/useVoiceInput.ts | 202 +++++++++++++++++++----------
 1 file changed, 135 insertions(+), 67 deletions(-)

diff --git a/src/browser/hooks/useVoiceInput.ts b/src/browser/hooks/useVoiceInput.ts
index 8570be8843..46d273d3dd 100644
--- a/src/browser/hooks/useVoiceInput.ts
+++ b/src/browser/hooks/useVoiceInput.ts
@@ -1,8 +1,9 @@
 /**
- * Hook for voice input using OpenAI Whisper API via MediaRecorder.
+ * Voice input via OpenAI transcription (gpt-4o-transcribe).
  *
- * Records audio, sends to backend for Whisper transcription, returns text.
- * Hidden on mobile (native keyboards have built-in dictation).
+ * State machine: idle → recording → transcribing → idle (cancel: recording → idle)
+ *
+ * Hidden on touch devices where native keyboard dictation is available.
  */
 
 import { useState, useCallback, useRef, useEffect } from "react";
@@ -12,6 +13,7 @@ export type VoiceInputState = "idle" | "recording" | "transcribing";
 export interface UseVoiceInputOptions {
   onTranscript: (text: string) => void;
   onError?: (error: string) => void;
+  /** Called after successful transcription if stop({ send: true }) was used */
   onSend?: () => void;
   openAIKeySet: boolean;
 }
@@ -20,69 +22,120 @@ export interface UseVoiceInputResult {
   state: VoiceInputState;
   isSupported: boolean;
   isApiKeySet: boolean;
-  /** Show UI on supported desktop platforms (mobile has native dictation) */
+  /** False on touch devices (native keyboard dictation) or when MediaRecorder is unavailable */
   shouldShowUI: boolean;
   start: () => void;
   stop: (options?: { send?: boolean }) => void;
-  /** Cancel recording without transcribing (discard audio) */
   cancel: () => void;
   toggle: () => void;
 }
 
-// Platform checks (evaluated once)
-const isMobile =
-  typeof window !== "undefined" &&
-  ("ontouchstart" in window || navigator.maxTouchPoints > 0) &&
-  window.innerWidth < 768;
+// =============================================================================
+// Platform Detection
+// =============================================================================
+
+/**
+ * Detect touch-capable devices, where native keyboard dictation is typically available.
+ * This includes phones, tablets (iPad), and any laptop or desktop with a touchscreen.
+ * We hide our voice UI on these devices to avoid redundancy with system dictation.
+ */
+function hasTouchDictation(): boolean {
+  if (typeof window === "undefined") return false;
+  const hasTouch = "ontouchstart" in window || navigator.maxTouchPoints > 0;
+  // Touch-only check: most touch devices have native dictation.
+  // We don't check screen size because iPads are large but still have dictation.
+  return hasTouch;
+}
+
+const HAS_TOUCH_DICTATION = hasTouchDictation();
+const HAS_MEDIA_RECORDER = typeof window !== "undefined" && typeof MediaRecorder !== "undefined";
 
-const isSupported = typeof window !== "undefined" && typeof MediaRecorder !== "undefined";
+// =============================================================================
+// Hook
+// =============================================================================
 
 export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResult {
   const [state, setState] = useState<VoiceInputState>("idle");
 
-  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
-  const audioChunksRef = useRef<Blob[]>([]);
+  // Refs for MediaRecorder lifecycle
+  const recorderRef = useRef<MediaRecorder | null>(null);
   const streamRef = useRef<MediaStream | null>(null);
-  const sendAfterTranscribeRef = useRef(false);
-  const cancelledRef = useRef(false);
+  const chunksRef = useRef<Blob[]>([]);
 
-  // Store callbacks in refs to avoid stale closures
+  // Flags set before stopping to control post-stop behavior
+  const shouldSendRef = useRef(false);
+  const wasCancelledRef = useRef(false);
+
+  // Keep callbacks fresh without recreating functions
   const callbacksRef = useRef(options);
   useEffect(() => {
     callbacksRef.current = options;
   }, [options]);
 
+  // ---------------------------------------------------------------------------
+  // Transcription
+  // ---------------------------------------------------------------------------
+
   const transcribe = useCallback(async (audioBlob: Blob) => {
     setState("transcribing");
-    const shouldSend = sendAfterTranscribeRef.current;
-    sendAfterTranscribeRef.current = false;
+
+    // Capture and reset the send flag
+    const shouldSend = shouldSendRef.current;
+    shouldSendRef.current = false;
 
     try {
-      const arrayBuffer = await audioBlob.arrayBuffer();
+      // Encode audio as base64 for IPC transport
+      const buffer = await audioBlob.arrayBuffer();
       const base64 = btoa(
-        new Uint8Array(arrayBuffer).reduce((data, byte) => data + String.fromCharCode(byte), "")
+        new Uint8Array(buffer).reduce((str, byte) => str + String.fromCharCode(byte), "")
       );
 
       const result = await window.api.voice.transcribe(base64);
 
-      if (result.success && result.data.trim()) {
-        callbacksRef.current.onTranscript(result.data);
-        if (shouldSend) {
-          setTimeout(() => callbacksRef.current.onSend?.(), 0);
-        }
-      } else if (!result.success) {
+      if (!result.success) {
         callbacksRef.current.onError?.(result.error);
+        return;
+      }
+
+      const text = result.data.trim();
+      if (!text) return; // Empty transcription, nothing to do
+
+      callbacksRef.current.onTranscript(text);
+
+      // If stop({ send: true }) was called, trigger send after React flushes
+      if (shouldSend) {
+        setTimeout(() => callbacksRef.current.onSend?.(), 0);
       }
     } catch (err) {
-      const message = err instanceof Error ? err.message : String(err);
-      callbacksRef.current.onError?.(`Transcription failed: ${message}`);
+      const msg = err instanceof Error ? err.message : String(err);
+      callbacksRef.current.onError?.(`Transcription failed: ${msg}`);
     } finally {
       setState("idle");
     }
   }, []);
 
+  // ---------------------------------------------------------------------------
+  // Release microphone (stop all tracks on the active stream)
+  // ---------------------------------------------------------------------------
+
+  const releaseStream = useCallback(() => {
+    streamRef.current?.getTracks().forEach((t) => t.stop());
+    streamRef.current = null;
+  }, []);
+
+  // ---------------------------------------------------------------------------
+  // Start Recording
+  // ---------------------------------------------------------------------------
+
   const start = useCallback(async () => {
-    if (!isSupported || isMobile || state !== "idle" || !callbacksRef.current.openAIKeySet) return;
+    // Guard: only start from idle state with valid configuration
+    const canStart =
+      HAS_MEDIA_RECORDER &&
+      !HAS_TOUCH_DICTATION &&
+      state === "idle" &&
+      callbacksRef.current.openAIKeySet;
+
+    if (!canStart) return;
 
     try {
       const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
@@ -93,20 +146,22 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
         : "audio/webm";
 
       const recorder = new MediaRecorder(stream, { mimeType });
-      audioChunksRef.current = [];
+      chunksRef.current = [];
 
       recorder.ondataavailable = (e) => {
-        if (e.data.size > 0) audioChunksRef.current.push(e.data);
+        if (e.data.size > 0) chunksRef.current.push(e.data);
       };
 
       recorder.onstop = () => {
-        const wasCancelled = cancelledRef.current;
-        cancelledRef.current = false;
-        const blob = new Blob(audioChunksRef.current, { type: mimeType });
-        audioChunksRef.current = [];
-        stream.getTracks().forEach((t) => t.stop());
-        streamRef.current = null;
-        if (wasCancelled) {
+        // Check if this was a cancel (discard audio) or normal stop (transcribe)
+        const cancelled = wasCancelledRef.current;
+        wasCancelledRef.current = false;
+
+        const blob = new Blob(chunksRef.current, { type: mimeType });
+        chunksRef.current = [];
+        releaseStream();
+
+        if (cancelled) {
           setState("idle");
         } else {
           void transcribe(blob);
@@ -115,63 +170,76 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul
 
       recorder.onerror = () => {
         callbacksRef.current.onError?.("Recording failed");
+        releaseStream();
         setState("idle");
-        stream.getTracks().forEach((t) => t.stop());
-        streamRef.current = null;
       };
 
-      mediaRecorderRef.current = recorder;
+      recorderRef.current = recorder;
       recorder.start();
       setState("recording");
     } catch (err) {
-      const message = err instanceof Error ? err.message : String(err);
-      const isPermissionError =
-        message.includes("Permission denied") || message.includes("NotAllowedError");
+      const msg = err instanceof Error ? err.message : String(err);
+      const isPermissionDenied = msg.includes("Permission denied") || msg.includes("NotAllowed");
+
       callbacksRef.current.onError?.(
-        isPermissionError
+        isPermissionDenied
           ? "Microphone access denied. Please allow microphone access and try again."
-          : `Failed to start recording: ${message}`
+          : `Failed to start recording: ${msg}`
       );
     }
-  }, [state, transcribe]);
+  }, [state, transcribe, releaseStream]);
+
+  // ---------------------------------------------------------------------------
+  // Stop Recording (triggers transcription)
+  // ---------------------------------------------------------------------------
 
   const stop = useCallback((options?: { send?: boolean }) => {
-    if (options?.send) sendAfterTranscribeRef.current = true;
-    if (mediaRecorderRef.current?.state !== "inactive") {
-      mediaRecorderRef.current?.stop();
-      mediaRecorderRef.current = null;
+    if (options?.send) shouldSendRef.current = true;
+
+    if (recorderRef.current?.state !== "inactive") {
+      recorderRef.current?.stop();
+      recorderRef.current = null;
     }
   }, []);
 
+  // ---------------------------------------------------------------------------
+  // Cancel Recording (discard audio, no transcription)
+  // ---------------------------------------------------------------------------
+
   const cancel = useCallback(() => {
-    cancelledRef.current = true;
-    if (mediaRecorderRef.current?.state !== "inactive") {
-      mediaRecorderRef.current?.stop();
-      mediaRecorderRef.current = null;
-    }
-  }, []);
+    wasCancelledRef.current = true;
+    stop();
+  }, [stop]);
+
+  // ---------------------------------------------------------------------------
+  // Toggle (convenience for keybinds)
+  // ---------------------------------------------------------------------------
 
   const toggle = useCallback(() => {
-    if (state === "recording") {
-      stop();
-    } else if (state === "idle") {
-      void start();
-    }
+    if (state === "recording") stop();
+    else if (state === "idle") void start();
   }, [state, start, stop]);
 
+  // ---------------------------------------------------------------------------
   // Cleanup on unmount
+  // ---------------------------------------------------------------------------
+
   useEffect(() => {
     return () => {
-      mediaRecorderRef.current?.stop();
-      streamRef.current?.getTracks().forEach((t) => t.stop());
+      recorderRef.current?.stop();
+      releaseStream();
     };
-  }, []);
+  }, [releaseStream]);
+
+  // ---------------------------------------------------------------------------
+  // Return
+  // ---------------------------------------------------------------------------
 
   return {
     state,
-    isSupported,
+    isSupported: HAS_MEDIA_RECORDER,
     isApiKeySet: callbacksRef.current.openAIKeySet,
-    shouldShowUI: isSupported && !isMobile,
+    shouldShowUI: HAS_MEDIA_RECORDER && !HAS_TOUCH_DICTATION,
     start: () => void start(),
     stop,
     cancel,

From 7e1294b6c768301dc90805bfaec31471805c4e79 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Tue, 2 Dec 2025 00:20:39 -0600
Subject: [PATCH 18/18] fix: make E2E test more specific for OpenAI provider
 button

---
 tests/e2e/scenarios/settings.spec.ts | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/scenarios/settings.spec.ts b/tests/e2e/scenarios/settings.spec.ts
index a6205d36e1..4ce2c4b15f 100644
--- a/tests/e2e/scenarios/settings.spec.ts
+++ b/tests/e2e/scenarios/settings.spec.ts
@@ -97,7 +97,9 @@ test.describe("Settings Modal", () => {
 
     // Verify all providers are listed with correct display names
     await expect(page.getByRole("button", { name: /Anthropic/i })).toBeVisible();
-    await expect(page.getByRole("button", { name: /OpenAI/i })).toBeVisible();
+    await expect(
+      page.getByRole("button", { name: /OpenAI/i }).filter({ has: page.getByText("OpenAI icon") })
+    ).toBeVisible();
     await expect(page.getByRole("button", { name: /Google/i })).toBeVisible();
     await expect(page.getByRole("button", { name: /xAI/i })).toBeVisible();
     await expect(page.getByRole("button", { name: /Ollama/i })).toBeVisible();