From 9c89d47b037544fe8dca97f0d260d54628e5a646 Mon Sep 17 00:00:00 2001 From: Ammar Date: Mon, 1 Dec 2025 22:41:47 -0600 Subject: [PATCH 01/18] =?UTF-8?q?=F0=9F=A4=96=20feat:=20add=20voice=20inpu?= =?UTF-8?q?t=20mode=20using=20OpenAI=20Whisper?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Ctrl+D / Cmd+D keybind to toggle voice recording - Add mic button next to send button (hidden on mobile or when no OpenAI key) - Add command palette command for voice input toggle - Record audio via MediaRecorder, transcribe via Whisper API - Show recording indicator while capturing, spinner while transcribing - Append dictated text to existing input - Handle errors with user-friendly toast messages Requires OpenAI API key to be configured in Settings > Providers. _Generated with mux_ --- bun.lock | 3 + package.json | 1 + src/browser/api.ts | 3 + .../components/ChatInput/VoiceInputButton.tsx | 66 ++++++ src/browser/components/ChatInput/index.tsx | 178 ++++++++++++++-- src/browser/hooks/useVoiceInput.ts | 193 ++++++++++++++++++ src/browser/stories/mockFactory.ts | 3 + src/browser/utils/commandIds.ts | 1 + src/browser/utils/commands/sources.ts | 11 + src/browser/utils/ui/keybinds.ts | 5 + src/common/constants/events.ts | 7 + src/common/constants/ipc-constants.ts | 3 + src/common/types/ipc.ts | 4 + src/desktop/preload.ts | 4 + src/node/services/ipcMain.ts | 40 ++++ 15 files changed, 501 insertions(+), 21 deletions(-) create mode 100644 src/browser/components/ChatInput/VoiceInputButton.tsx create mode 100644 src/browser/hooks/useVoiceInput.ts diff --git a/bun.lock b/bun.lock index 899b1121ae..32ee53eb40 100644 --- a/bun.lock +++ b/bun.lock @@ -42,6 +42,7 @@ "minimist": "^1.2.8", "motion": "^12.23.24", "ollama-ai-provider-v2": "^1.5.4", + "openai": "^6.9.1", "rehype-harden": "^1.1.5", "shescape": "^2.1.6", "source-map-support": "^0.5.21", @@ -2688,6 +2689,8 @@ "oniguruma-to-es": ["oniguruma-to-es@4.3.3", "", { "dependencies": { "oniguruma-parser": "^0.12.1", "regex": "^6.0.1", "regex-recursion": "^6.0.2" } }, "sha512-rPiZhzC3wXwE59YQMRDodUwwT9FZ9nNBwQQfsd1wfdtlKEyCdRV0avrTcSZ5xlIvGRVPd/cx6ZN45ECmS39xvg=="], + "openai": ["openai@6.9.1", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-vQ5Rlt0ZgB3/BNmTa7bIijYFhz3YBceAA3Z4JuoMSBftBF9YqFHIEhZakSs+O/Ad7EaoEimZvHxD5ylRjN11Lg=="], + "optionator": ["optionator@0.9.4", "", { "dependencies": { "deep-is": "^0.1.3", "fast-levenshtein": "^2.0.6", "levn": "^0.4.1", "prelude-ls": "^1.2.1", "type-check": "^0.4.0", "word-wrap": "^1.2.5" } }, "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g=="], "ora": ["ora@5.4.1", "", { "dependencies": { "bl": "^4.1.0", "chalk": "^4.1.0", "cli-cursor": "^3.1.0", "cli-spinners": "^2.5.0", "is-interactive": "^1.0.0", "is-unicode-supported": "^0.1.0", "log-symbols": "^4.1.0", "strip-ansi": "^6.0.0", "wcwidth": "^1.0.1" } }, "sha512-5b6Y85tPxZZ7QytO+BQzysW31HJku27cRIlkbAXaNx+BdcVi+LlRFmVXzeF6a7JCwJpyw5c4b+YSVImQIrBpuQ=="], diff --git a/package.json b/package.json index 80ea0a48c4..28b2be79e3 100644 --- a/package.json +++ b/package.json @@ -83,6 +83,7 @@ "minimist": "^1.2.8", "motion": "^12.23.24", "ollama-ai-provider-v2": "^1.5.4", + "openai": "^6.9.1", "rehype-harden": "^1.1.5", "shescape": "^2.1.6", "source-map-support": "^0.5.21", diff --git a/src/browser/api.ts b/src/browser/api.ts index 33b9ad37ab..a98675cb4a 100644 --- a/src/browser/api.ts +++ b/src/browser/api.ts @@ -361,6 +361,9 @@ const webApi: IPCApi = { }, closeWindow: (workspaceId) => invokeIPC(IPC_CHANNELS.TERMINAL_WINDOW_CLOSE, workspaceId), }, + voice: { + transcribe: (audioBase64) => invokeIPC(IPC_CHANNELS.VOICE_TRANSCRIBE, audioBase64), + }, update: { check: () => invokeIPC(IPC_CHANNELS.UPDATE_CHECK), download: () => invokeIPC(IPC_CHANNELS.UPDATE_DOWNLOAD), diff --git a/src/browser/components/ChatInput/VoiceInputButton.tsx b/src/browser/components/ChatInput/VoiceInputButton.tsx new file mode 100644 index 0000000000..9673b36f68 --- /dev/null +++ b/src/browser/components/ChatInput/VoiceInputButton.tsx @@ -0,0 +1,66 @@ +/** + * Voice input button - floats inside the chat input textarea. + * Minimal footprint: just an icon that changes color based on state. + * + * Visual states: + * - Idle: Subtle gray mic icon + * - Recording: Red pulsing mic + * - Transcribing: Orange spinning loader + * - Hidden: When on mobile, unsupported, or no OpenAI key + */ + +import React from "react"; +import { Mic, Loader2 } from "lucide-react"; +import { TooltipWrapper, Tooltip } from "../Tooltip"; +import { formatKeybind, KEYBINDS } from "@/browser/utils/ui/keybinds"; +import { cn } from "@/common/lib/utils"; + +interface VoiceInputButtonProps { + isListening: boolean; + isTranscribing: boolean; + isSupported: boolean; + shouldShowUI: boolean; + onToggle: () => void; + disabled?: boolean; +} + +export const VoiceInputButton: React.FC = (props) => { + // Don't render if we shouldn't show UI (mobile, unsupported, or no OpenAI key) + if (!props.shouldShowUI) { + return null; + } + + const label = props.isTranscribing + ? "Transcribing..." + : props.isListening + ? "Stop recording" + : "Voice input"; + + const Icon = props.isTranscribing ? Loader2 : Mic; + + return ( + + + + {label} ({formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)}) + + + ); +}; diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx index 0a686843be..d7da34984c 100644 --- a/src/browser/components/ChatInput/index.tsx +++ b/src/browser/components/ChatInput/index.tsx @@ -65,6 +65,8 @@ import { cn } from "@/common/lib/utils"; import { CreationControls } from "./CreationControls"; import { useCreationWorkspace } from "./useCreationWorkspace"; import { useTutorial } from "@/browser/contexts/TutorialContext"; +import { useVoiceInput } from "@/browser/hooks/useVoiceInput"; +import { VoiceInputButton } from "./VoiceInputButton"; const LEADING_COMMAND_NOISE = /^(?:\s|\u200B|\u200C|\u200D|\u200E|\u200F|\uFEFF)+/; @@ -154,6 +156,45 @@ export const ChatInput: React.FC = (props) => { }); const { startSequence: startTutorial } = useTutorial(); + // Track if OpenAI API key is configured for voice input + const [openAIKeySet, setOpenAIKeySet] = useState(false); + + // Voice input handling - appends transcribed text to input + const handleVoiceTranscript = useCallback( + (text: string, _isFinal: boolean) => { + // Whisper only returns final results, append to input with space separator if needed + setInput((prev) => { + const separator = prev.length > 0 && !prev.endsWith(" ") ? " " : ""; + return prev + separator + text; + }); + }, + [setInput] + ); + + const handleVoiceError = useCallback( + (error: string) => { + // Map common errors to user-friendly messages + const errorMessages: Record = { + "not-allowed": "Microphone access denied. Please allow microphone access and try again.", + "no-speech": "No speech detected. Please try again.", + network: "Network error. Please check your connection.", + "audio-capture": "No microphone found. Please connect a microphone.", + }; + setToast({ + id: Date.now().toString(), + type: "error", + message: errorMessages[error] ?? `Voice input error: ${error}`, + }); + }, + [setToast] + ); + + const voiceInput = useVoiceInput({ + onTranscript: handleVoiceTranscript, + onError: handleVoiceError, + openAIKeySet, + }); + // Start creation tutorial when entering creation mode useEffect(() => { if (variant === "creation") { @@ -370,6 +411,28 @@ export const ChatInput: React.FC = (props) => { }; }, []); + // Check if OpenAI API key is configured (for voice input) + useEffect(() => { + let isMounted = true; + + const checkOpenAIKey = async () => { + try { + const config = await window.api.providers.getConfig(); + if (isMounted) { + setOpenAIKeySet(config.openai?.apiKeySet ?? false); + } + } catch (error) { + console.error("Failed to check OpenAI API key:", error); + } + }; + + void checkOpenAIKey(); + + return () => { + isMounted = false; + }; + }, []); + // Allow external components (e.g., CommandPalette, Queued message edits) to insert text useEffect(() => { const handler = (e: Event) => { @@ -437,6 +500,18 @@ export const ChatInput: React.FC = (props) => { window.removeEventListener(CUSTOM_EVENTS.THINKING_LEVEL_TOAST, handler as EventListener); }, [variant, props, setToast]); + // Listen for voice input toggle from command palette + useEffect(() => { + if (!voiceInput.shouldShowUI) return; + + const handler = () => { + voiceInput.toggleListening(); + }; + window.addEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener); + return () => + window.removeEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener); + }, [voiceInput]); + // Auto-focus chat input when workspace changes (workspace only) const workspaceIdForFocus = variant === "workspace" ? props.workspaceId : null; useEffect(() => { @@ -768,6 +843,13 @@ export const ChatInput: React.FC = (props) => { return; } + // Handle voice input toggle (Ctrl+D / Cmd+D) + if (matchesKeybind(e, KEYBINDS.TOGGLE_VOICE_INPUT) && voiceInput.shouldShowUI) { + e.preventDefault(); + voiceInput.toggleListening(); + return; + } + // Handle open model selector if (matchesKeybind(e, KEYBINDS.OPEN_MODEL_SELECTOR)) { e.preventDefault(); @@ -896,27 +978,81 @@ export const ChatInput: React.FC = (props) => { anchorRef={variant === "creation" ? inputRef : undefined} /> -
- 0 ? commandListId : undefined - } - aria-expanded={showCommandSuggestions && commandSuggestions.length > 0} - /> +
+ {/* Recording overlay - dramatically replaces textarea when recording */} + {voiceInput.isListening ? ( + + ) : ( + <> + 0 + ? commandListId + : undefined + } + aria-expanded={showCommandSuggestions && commandSuggestions.length > 0} + /> + {/* Floating voice input button inside textarea */} +
+ +
+ + )}
{/* Image attachments */} diff --git a/src/browser/hooks/useVoiceInput.ts b/src/browser/hooks/useVoiceInput.ts new file mode 100644 index 0000000000..e92b6e884e --- /dev/null +++ b/src/browser/hooks/useVoiceInput.ts @@ -0,0 +1,193 @@ +/** + * Hook for voice input using OpenAI Whisper API via MediaRecorder. + * + * Features: + * - Records audio using MediaRecorder (webm/opus format) + * - Sends to backend which calls OpenAI Whisper for transcription + * - Shows recording state while capturing + * - Shows transcribing state while processing + * - Hidden on mobile (native keyboards have built-in dictation) + * - Disabled when OpenAI API key not configured + */ + +import { useState, useCallback, useRef, useEffect } from "react"; + +// Check if we're on a mobile device (touch-based) +function isMobileDevice(): boolean { + if (typeof window === "undefined") return false; + // Check for touch capability and small screen as heuristics + return ("ontouchstart" in window || navigator.maxTouchPoints > 0) && window.innerWidth < 768; +} + +// Check if MediaRecorder is available +function isMediaRecorderSupported(): boolean { + return typeof window !== "undefined" && typeof MediaRecorder !== "undefined"; +} + +export interface UseVoiceInputOptions { + /** Called when transcript text is received */ + onTranscript: (text: string, isFinal: boolean) => void; + /** Called when an error occurs */ + onError?: (error: string) => void; + /** Whether OpenAI API key is configured */ + openAIKeySet: boolean; +} + +export interface UseVoiceInputResult { + /** Whether voice input is currently recording */ + isListening: boolean; + /** Whether transcription is in progress */ + isTranscribing: boolean; + /** Whether the browser supports MediaRecorder */ + isSupported: boolean; + /** Whether we should show voice UI (supported, not mobile, API key set) */ + shouldShowUI: boolean; + /** Start recording for voice input */ + startListening: () => void; + /** Stop recording and transcribe */ + stopListening: () => void; + /** Toggle recording state */ + toggleListening: () => void; +} + +export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResult { + const { onTranscript, onError, openAIKeySet } = options; + + const [isListening, setIsListening] = useState(false); + const [isTranscribing, setIsTranscribing] = useState(false); + + const mediaRecorderRef = useRef(null); + const audioChunksRef = useRef([]); + const streamRef = useRef(null); + + const isSupported = isMediaRecorderSupported(); + const isMobile = isMobileDevice(); + + // Store callbacks in refs to avoid recreating on every render + const onTranscriptRef = useRef(onTranscript); + const onErrorRef = useRef(onError); + useEffect(() => { + onTranscriptRef.current = onTranscript; + onErrorRef.current = onError; + }, [onTranscript, onError]); + + const transcribeAudio = useCallback(async (audioBlob: Blob) => { + setIsTranscribing(true); + try { + // Convert blob to base64 + const arrayBuffer = await audioBlob.arrayBuffer(); + const base64 = btoa( + new Uint8Array(arrayBuffer).reduce((data, byte) => data + String.fromCharCode(byte), "") + ); + + // Call backend to transcribe + const result = await window.api.voice.transcribe(base64); + + if (result.success) { + if (result.data.trim()) { + onTranscriptRef.current(result.data, true); + } + } else { + onErrorRef.current?.(result.error); + } + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + onErrorRef.current?.(`Transcription failed: ${message}`); + } finally { + setIsTranscribing(false); + } + }, []); + + const startListening = useCallback(async () => { + if (!isSupported || isListening || isTranscribing || !openAIKeySet) return; + + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + streamRef.current = stream; + + // Use webm/opus which is well supported and works with Whisper + const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus") + ? "audio/webm;codecs=opus" + : "audio/webm"; + + const mediaRecorder = new MediaRecorder(stream, { mimeType }); + audioChunksRef.current = []; + + mediaRecorder.ondataavailable = (event) => { + if (event.data.size > 0) { + audioChunksRef.current.push(event.data); + } + }; + + mediaRecorder.onstop = () => { + const audioBlob = new Blob(audioChunksRef.current, { type: mimeType }); + audioChunksRef.current = []; + + // Stop all tracks to release microphone + stream.getTracks().forEach((track) => track.stop()); + streamRef.current = null; + + // Transcribe the audio + void transcribeAudio(audioBlob); + }; + + mediaRecorder.onerror = () => { + onErrorRef.current?.("Recording failed"); + setIsListening(false); + stream.getTracks().forEach((track) => track.stop()); + streamRef.current = null; + }; + + mediaRecorderRef.current = mediaRecorder; + mediaRecorder.start(); + setIsListening(true); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + if (message.includes("Permission denied") || message.includes("NotAllowedError")) { + onErrorRef.current?.( + "Microphone access denied. Please allow microphone access and try again." + ); + } else { + onErrorRef.current?.(`Failed to start recording: ${message}`); + } + } + }, [isSupported, isListening, isTranscribing, openAIKeySet, transcribeAudio]); + + const stopListening = useCallback(() => { + if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") { + mediaRecorderRef.current.stop(); + mediaRecorderRef.current = null; + } + setIsListening(false); + }, []); + + const toggleListening = useCallback(() => { + if (isListening) { + stopListening(); + } else { + void startListening(); + } + }, [isListening, startListening, stopListening]); + + // Cleanup on unmount + useEffect(() => { + return () => { + if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") { + mediaRecorderRef.current.stop(); + } + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + } + }; + }, []); + + return { + isListening, + isTranscribing, + isSupported, + shouldShowUI: isSupported && !isMobile && openAIKeySet, + startListening: () => void startListening(), + stopListening, + toggleListening, + }; +} diff --git a/src/browser/stories/mockFactory.ts b/src/browser/stories/mockFactory.ts index 29916ebca6..3de5878047 100644 --- a/src/browser/stories/mockFactory.ts +++ b/src/browser/stories/mockFactory.ts @@ -456,6 +456,9 @@ export function createMockAPI(options: MockAPIOptions): IPCApi { openWindow: () => Promise.resolve(undefined), closeWindow: () => Promise.resolve(undefined), }, + voice: { + transcribe: () => Promise.resolve({ success: false, error: "Not implemented in mock" }), + }, update: { check: () => Promise.resolve(undefined), download: () => Promise.resolve(undefined), diff --git a/src/browser/utils/commandIds.ts b/src/browser/utils/commandIds.ts index 8976082fc4..cc92aa0057 100644 --- a/src/browser/utils/commandIds.ts +++ b/src/browser/utils/commandIds.ts @@ -39,6 +39,7 @@ export const CommandIds = { chatTruncate: (pct: number) => `${COMMAND_ID_PREFIXES.CHAT_TRUNCATE}${pct}` as const, chatInterrupt: () => "chat:interrupt" as const, chatJumpBottom: () => "chat:jumpBottom" as const, + chatVoiceInput: () => "chat:voiceInput" as const, // Mode commands modeToggle: () => "mode:toggle" as const, diff --git a/src/browser/utils/commands/sources.ts b/src/browser/utils/commands/sources.ts index 09029e5f44..8738f6b5ee 100644 --- a/src/browser/utils/commands/sources.ts +++ b/src/browser/utils/commands/sources.ts @@ -388,6 +388,17 @@ export function buildCoreSources(p: BuildSourcesParams): Array<() => CommandActi window.dispatchEvent(ev); }, }); + list.push({ + id: CommandIds.chatVoiceInput(), + title: "Toggle Voice Input", + subtitle: "Dictate instead of typing", + section: section.chat, + shortcutHint: formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT), + run: () => { + // Dispatch custom event; ChatInput listens for it + window.dispatchEvent(createCustomEvent(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT)); + }, + }); } return list; }); diff --git a/src/browser/utils/ui/keybinds.ts b/src/browser/utils/ui/keybinds.ts index 0a85f645b4..0bc979c542 100644 --- a/src/browser/utils/ui/keybinds.ts +++ b/src/browser/utils/ui/keybinds.ts @@ -285,4 +285,9 @@ export const KEYBINDS = { /** Open settings modal */ // macOS: Cmd+, Win/Linux: Ctrl+, OPEN_SETTINGS: { key: ",", ctrl: true }, + + /** Toggle voice input (dictation) */ + // macOS: Cmd+D, Win/Linux: Ctrl+D + // "D" for Dictate - intuitive and available + TOGGLE_VOICE_INPUT: { key: "d", ctrl: true }, } as const; diff --git a/src/common/constants/events.ts b/src/common/constants/events.ts index ccbd592113..807bc226fa 100644 --- a/src/common/constants/events.ts +++ b/src/common/constants/events.ts @@ -56,6 +56,12 @@ export const CUSTOM_EVENTS = { * Detail: { projectPath: string, startMessage?: string, model?: string, trunkBranch?: string, runtime?: string } */ START_WORKSPACE_CREATION: "mux:startWorkspaceCreation", + + /** + * Event to toggle voice input (dictation) mode + * No detail + */ + TOGGLE_VOICE_INPUT: "mux:toggleVoiceInput", } as const; /** @@ -94,6 +100,7 @@ export interface CustomEventPayloads { trunkBranch?: string; runtime?: string; }; + [CUSTOM_EVENTS.TOGGLE_VOICE_INPUT]: never; // No payload } /** diff --git a/src/common/constants/ipc-constants.ts b/src/common/constants/ipc-constants.ts index be7bc45ccf..c335928a09 100644 --- a/src/common/constants/ipc-constants.ts +++ b/src/common/constants/ipc-constants.ts @@ -68,6 +68,9 @@ export const IPC_CHANNELS = { TOKENIZER_COUNT_TOKENS: "tokenizer:countTokens", TOKENIZER_COUNT_TOKENS_BATCH: "tokenizer:countTokensBatch", + // Voice channels + VOICE_TRANSCRIBE: "voice:transcribe", + // Dynamic channel prefixes WORKSPACE_CHAT_PREFIX: "workspace:chat:", WORKSPACE_METADATA: "workspace:metadata", diff --git a/src/common/types/ipc.ts b/src/common/types/ipc.ts index 22f844e504..d07846257e 100644 --- a/src/common/types/ipc.ts +++ b/src/common/types/ipc.ts @@ -371,6 +371,10 @@ export interface IPCApi { openWindow(workspaceId: string): Promise; closeWindow(workspaceId: string): Promise; }; + voice: { + /** Transcribe audio using OpenAI Whisper. Audio should be base64-encoded webm/opus. */ + transcribe(audioBase64: string): Promise>; + }; update: { check(): Promise; download(): Promise; diff --git a/src/desktop/preload.ts b/src/desktop/preload.ts index 8a9ea1c71c..8ac8a5f8d7 100644 --- a/src/desktop/preload.ts +++ b/src/desktop/preload.ts @@ -160,6 +160,10 @@ const api: IPCApi = { window: { setTitle: (title: string) => ipcRenderer.invoke(IPC_CHANNELS.WINDOW_SET_TITLE, title), }, + voice: { + transcribe: (audioBase64: string) => + ipcRenderer.invoke(IPC_CHANNELS.VOICE_TRANSCRIBE, audioBase64), + }, update: { check: () => ipcRenderer.invoke(IPC_CHANNELS.UPDATE_CHECK), download: () => ipcRenderer.invoke(IPC_CHANNELS.UPDATE_DOWNLOAD), diff --git a/src/node/services/ipcMain.ts b/src/node/services/ipcMain.ts index 7de28a0d7e..9fa51fb784 100644 --- a/src/node/services/ipcMain.ts +++ b/src/node/services/ipcMain.ts @@ -44,6 +44,7 @@ import { PTYService } from "@/node/services/ptyService"; import type { TerminalWindowManager } from "@/desktop/terminalWindowManager"; import type { TerminalCreateParams, TerminalResizeParams } from "@/common/types/terminal"; import { ExtensionMetadataService } from "@/node/services/ExtensionMetadataService"; +import OpenAI from "openai"; /** Maximum number of retry attempts when workspace name collides */ const MAX_WORKSPACE_NAME_COLLISION_RETRIES = 3; @@ -633,6 +634,45 @@ export class IpcMain { } private registerWorkspaceHandlers(ipcMain: ElectronIpcMain): void { + // Voice transcription handler (uses OpenAI Whisper) + ipcMain.handle( + IPC_CHANNELS.VOICE_TRANSCRIBE, + async (_event, audioBase64: string): Promise> => { + try { + // Get OpenAI config + const providersConfig = this.config.loadProvidersConfig(); + const openaiConfig = providersConfig?.openai; + + if (!openaiConfig?.apiKey) { + return Err("OpenAI API key not configured. Set it in Settings > Providers."); + } + + const client = new OpenAI({ + apiKey: openaiConfig.apiKey, + baseURL: openaiConfig.baseUrl ?? openaiConfig.baseURL, + }); + + // Convert base64 to buffer + const audioBuffer = Buffer.from(audioBase64, "base64"); + + // Create a File object for the API + const audioFile = new File([audioBuffer], "audio.webm", { type: "audio/webm" }); + + // Call Whisper API + const transcription = await client.audio.transcriptions.create({ + file: audioFile, + model: "whisper-1", + }); + + return Ok(transcription.text); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + log.error("[IpcMain] Voice transcription failed", error); + return Err(`Transcription failed: ${message}`); + } + } + ); + ipcMain.handle( IPC_CHANNELS.WORKSPACE_CREATE, async ( From e54dbf9b7a5f9b109b64e3fccaf943f4c15c47d8 Mon Sep 17 00:00:00 2001 From: Ammar Date: Mon, 1 Dec 2025 23:34:52 -0600 Subject: [PATCH 02/18] feat: improve voice recording UI states - Show overlay during both recording AND transcribing states (prevents jarring snap-back to empty textarea when waiting for API) - Change colors from red (error-like) to blue (recording) and amber (transcribing) - Disable overlay button while transcribing to prevent double-clicks --- .../components/ChatInput/VoiceInputButton.tsx | 8 ++-- src/browser/components/ChatInput/index.tsx | 39 ++++++++++++++----- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/browser/components/ChatInput/VoiceInputButton.tsx b/src/browser/components/ChatInput/VoiceInputButton.tsx index 9673b36f68..15548135bc 100644 --- a/src/browser/components/ChatInput/VoiceInputButton.tsx +++ b/src/browser/components/ChatInput/VoiceInputButton.tsx @@ -4,8 +4,8 @@ * * Visual states: * - Idle: Subtle gray mic icon - * - Recording: Red pulsing mic - * - Transcribing: Orange spinning loader + * - Recording: Blue pulsing mic + * - Transcribing: Amber spinning loader * - Hidden: When on mobile, unsupported, or no OpenAI key */ @@ -50,9 +50,9 @@ export const VoiceInputButton: React.FC = (props) => { "inline-flex items-center justify-center rounded p-0.5 transition-colors duration-150", "disabled:cursor-not-allowed disabled:opacity-40", props.isTranscribing - ? "text-orange-500" + ? "text-amber-500" : props.isListening - ? "text-red-500 animate-pulse" + ? "text-blue-500 animate-pulse" : "text-muted/50 hover:text-muted" )} > diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx index d7da34984c..3a14037c6a 100644 --- a/src/browser/components/ChatInput/index.tsx +++ b/src/browser/components/ChatInput/index.tsx @@ -979,20 +979,29 @@ export const ChatInput: React.FC = (props) => { />
- {/* Recording overlay - dramatically replaces textarea when recording */} - {voiceInput.isListening ? ( + {/* Recording/transcribing overlay - dramatically replaces textarea */} + {voiceInput.isListening || voiceInput.isTranscribing ? ( - {label} ({formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)}) + {needsApiKey ? ( + <> + Voice input requires OpenAI API key. +
+ Configure in Settings → Providers. + + ) : ( + <> + {label} ({formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)}) + + )}
); diff --git a/src/browser/components/ChatInput/WaveformBars.tsx b/src/browser/components/ChatInput/WaveformBars.tsx new file mode 100644 index 0000000000..e8c834016b --- /dev/null +++ b/src/browser/components/ChatInput/WaveformBars.tsx @@ -0,0 +1,32 @@ +/** + * Animated waveform bars for voice recording UI. + * Shows 5 bars with staggered pulse animation. + */ + +import { cn } from "@/common/lib/utils"; + +interface WaveformBarsProps { + /** Color class for the bars (e.g., "bg-blue-500") */ + colorClass: string; + /** Whether to mirror the animation (for right-side waveform) */ + mirrored?: boolean; +} + +export const WaveformBars: React.FC = (props) => { + const indices = props.mirrored ? [4, 3, 2, 1, 0] : [0, 1, 2, 3, 4]; + + return ( +
+ {indices.map((i, displayIndex) => ( +
+ ))} +
+ ); +}; diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx index 1531711ef4..303630546c 100644 --- a/src/browser/components/ChatInput/index.tsx +++ b/src/browser/components/ChatInput/index.tsx @@ -67,6 +67,7 @@ import { useCreationWorkspace } from "./useCreationWorkspace"; import { useTutorial } from "@/browser/contexts/TutorialContext"; import { useVoiceInput } from "@/browser/hooks/useVoiceInput"; import { VoiceInputButton } from "./VoiceInputButton"; +import { WaveformBars } from "./WaveformBars"; const LEADING_COMMAND_NOISE = /^(?:\s|\u200B|\u200C|\u200D|\u200E|\u200F|\uFEFF)+/; @@ -173,17 +174,10 @@ export const ChatInput: React.FC = (props) => { const handleVoiceError = useCallback( (error: string) => { - // Map common errors to user-friendly messages - const errorMessages: Record = { - "not-allowed": "Microphone access denied. Please allow microphone access and try again.", - "no-speech": "No speech detected. Please try again.", - network: "Network error. Please check your connection.", - "audio-capture": "No microphone found. Please connect a microphone.", - }; setToast({ id: Date.now().toString(), type: "error", - message: errorMessages[error] ?? `Voice input error: ${error}`, + message: error, }); }, [setToast] @@ -506,12 +500,20 @@ export const ChatInput: React.FC = (props) => { if (!voiceInput.shouldShowUI) return; const handler = () => { + if (!voiceInput.isApiKeySet) { + setToast({ + id: Date.now().toString(), + type: "error", + message: "Voice input requires OpenAI API key. Configure in Settings → Providers.", + }); + return; + } voiceInput.toggleListening(); }; window.addEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener); return () => window.removeEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener); - }, [voiceInput]); + }, [voiceInput, setToast]); // Auto-focus chat input when workspace changes (workspace only) const workspaceIdForFocus = variant === "workspace" ? props.workspaceId : null; @@ -847,6 +849,14 @@ export const ChatInput: React.FC = (props) => { // Handle voice input toggle (Ctrl+D / Cmd+D) if (matchesKeybind(e, KEYBINDS.TOGGLE_VOICE_INPUT) && voiceInput.shouldShowUI) { e.preventDefault(); + if (!voiceInput.isApiKeySet) { + setToast({ + id: Date.now().toString(), + type: "error", + message: "Voice input requires OpenAI API key. Configure in Settings → Providers.", + }); + return; + } voiceInput.toggleListening(); return; } @@ -1002,22 +1012,9 @@ export const ChatInput: React.FC = (props) => { )} aria-label={voiceInput.isListening ? "Stop recording" : "Transcribing..."} > - {/* Animated waveform bars */} -
- {[0, 1, 2, 3, 4].map((i) => ( -
- ))} -
+ = (props) => { ? `Recording... space to send, ${formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} to stop` : "Transcribing..."} -
- {[0, 1, 2, 3, 4].map((i) => ( -
- ))} -
+ ) : ( <> @@ -1074,6 +1060,7 @@ export const ChatInput: React.FC = (props) => { isListening={voiceInput.isListening} isTranscribing={voiceInput.isTranscribing} isSupported={voiceInput.isSupported} + isApiKeySet={voiceInput.isApiKeySet} shouldShowUI={voiceInput.shouldShowUI} onToggle={voiceInput.toggleListening} disabled={disabled || isSending} diff --git a/src/browser/hooks/useVoiceInput.ts b/src/browser/hooks/useVoiceInput.ts index d5375b522c..5f6c0d3670 100644 --- a/src/browser/hooks/useVoiceInput.ts +++ b/src/browser/hooks/useVoiceInput.ts @@ -42,7 +42,9 @@ export interface UseVoiceInputResult { isTranscribing: boolean; /** Whether the browser supports MediaRecorder */ isSupported: boolean; - /** Whether we should show voice UI (supported, not mobile, API key set) */ + /** Whether OpenAI API key is configured */ + isApiKeySet: boolean; + /** Whether we should show voice UI (supported and not mobile) */ shouldShowUI: boolean; /** Start recording for voice input */ startListening: () => void; @@ -205,7 +207,9 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul isListening, isTranscribing, isSupported, - shouldShowUI: isSupported && !isMobile && openAIKeySet, + isApiKeySet: openAIKeySet, + // Show UI on supported desktop platforms (mobile has native dictation) + shouldShowUI: isSupported && !isMobile, startListening: () => void startListening(), stopListening, stopListeningAndSend, From adcec7cfaa2e042e8d7f1d2120920b19066cd085 Mon Sep 17 00:00:00 2001 From: Ammar Date: Mon, 1 Dec 2025 23:55:07 -0600 Subject: [PATCH 09/18] refactor: clean up useVoiceInput with enum state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace isListening/isTranscribing booleans with single state enum - Merge stopListening/stopListeningAndSend into stop(options?) - Rename methods: startListening→start, toggleListening→toggle - Consolidate callback refs into single callbacksRef object - Move platform checks (isMobile, isSupported) to module scope - Simplify VoiceInputButton with STATE_CONFIG lookup table - Inline simple callbacks in ChatInput (no separate handlers) --- .../components/ChatInput/VoiceInputButton.tsx | 50 ++--- src/browser/components/ChatInput/index.tsx | 60 ++--- src/browser/hooks/useVoiceInput.ts | 207 +++++++----------- 3 files changed, 115 insertions(+), 202 deletions(-) diff --git a/src/browser/components/ChatInput/VoiceInputButton.tsx b/src/browser/components/ChatInput/VoiceInputButton.tsx index 07103310f2..b4f27bede8 100644 --- a/src/browser/components/ChatInput/VoiceInputButton.tsx +++ b/src/browser/components/ChatInput/VoiceInputButton.tsx @@ -1,13 +1,6 @@ /** * Voice input button - floats inside the chat input textarea. * Minimal footprint: just an icon that changes color based on state. - * - * Visual states: - * - Idle: Subtle gray mic icon - * - Recording: Blue pulsing mic - * - Transcribing: Amber spinning loader - * - Disabled (no API key): Subtle gray with explanatory tooltip - * - Hidden: When on mobile or unsupported */ import React from "react"; @@ -15,55 +8,48 @@ import { Mic, Loader2 } from "lucide-react"; import { TooltipWrapper, Tooltip } from "../Tooltip"; import { formatKeybind, KEYBINDS } from "@/browser/utils/ui/keybinds"; import { cn } from "@/common/lib/utils"; +import type { VoiceInputState } from "@/browser/hooks/useVoiceInput"; interface VoiceInputButtonProps { - isListening: boolean; - isTranscribing: boolean; - isSupported: boolean; + state: VoiceInputState; isApiKeySet: boolean; shouldShowUI: boolean; onToggle: () => void; disabled?: boolean; } +const STATE_CONFIG: Record = { + idle: { label: "Voice input", colorClass: "text-muted/50 hover:text-muted" }, + recording: { label: "Stop recording", colorClass: "text-blue-500 animate-pulse" }, + transcribing: { label: "Transcribing...", colorClass: "text-amber-500" }, +}; + export const VoiceInputButton: React.FC = (props) => { - // Don't render on mobile or unsupported platforms - if (!props.shouldShowUI) { - return null; - } + if (!props.shouldShowUI) return null; const needsApiKey = !props.isApiKeySet; - const label = needsApiKey - ? "Voice input (requires OpenAI API key)" - : props.isTranscribing - ? "Transcribing..." - : props.isListening - ? "Stop recording" - : "Voice input"; + const { label, colorClass } = needsApiKey + ? { label: "Voice input (requires OpenAI API key)", colorClass: "text-muted/50" } + : STATE_CONFIG[props.state]; - const Icon = props.isTranscribing ? Loader2 : Mic; + const Icon = props.state === "transcribing" ? Loader2 : Mic; + const isTranscribing = props.state === "transcribing"; return ( {needsApiKey ? ( diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx index 303630546c..7c733e013b 100644 --- a/src/browser/components/ChatInput/index.tsx +++ b/src/browser/components/ChatInput/index.tsx @@ -160,32 +160,17 @@ export const ChatInput: React.FC = (props) => { // Track if OpenAI API key is configured for voice input const [openAIKeySet, setOpenAIKeySet] = useState(false); - // Voice input handling - appends transcribed text to input - const handleVoiceTranscript = useCallback( - (text: string, _isFinal: boolean) => { - // Whisper only returns final results, append to input with space separator if needed + // Voice input - appends transcribed text to input + const voiceInput = useVoiceInput({ + onTranscript: (text) => { setInput((prev) => { const separator = prev.length > 0 && !prev.endsWith(" ") ? " " : ""; return prev + separator + text; }); }, - [setInput] - ); - - const handleVoiceError = useCallback( - (error: string) => { - setToast({ - id: Date.now().toString(), - type: "error", - message: error, - }); + onError: (error) => { + setToast({ id: Date.now().toString(), type: "error", message: error }); }, - [setToast] - ); - - const voiceInput = useVoiceInput({ - onTranscript: handleVoiceTranscript, - onError: handleVoiceError, onSend: () => void handleSend(), openAIKeySet, }); @@ -508,7 +493,7 @@ export const ChatInput: React.FC = (props) => { }); return; } - voiceInput.toggleListening(); + voiceInput.toggle(); }; window.addEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener); return () => @@ -857,7 +842,7 @@ export const ChatInput: React.FC = (props) => { }); return; } - voiceInput.toggleListening(); + voiceInput.toggle(); return; } @@ -990,43 +975,42 @@ export const ChatInput: React.FC = (props) => { />
- {/* Recording/transcribing overlay - dramatically replaces textarea */} - {voiceInput.isListening || voiceInput.isTranscribing ? ( + {/* Recording/transcribing overlay - replaces textarea when active */} + {voiceInput.state !== "idle" ? ( @@ -1057,12 +1041,10 @@ export const ChatInput: React.FC = (props) => { {/* Floating voice input button inside textarea */}
diff --git a/src/browser/hooks/useVoiceInput.ts b/src/browser/hooks/useVoiceInput.ts index 5f6c0d3670..f196051f08 100644 --- a/src/browser/hooks/useVoiceInput.ts +++ b/src/browser/hooks/useVoiceInput.ts @@ -1,218 +1,163 @@ /** * Hook for voice input using OpenAI Whisper API via MediaRecorder. * - * Features: - * - Records audio using MediaRecorder (webm/opus format) - * - Sends to backend which calls OpenAI Whisper for transcription - * - Shows recording state while capturing - * - Shows transcribing state while processing - * - Hidden on mobile (native keyboards have built-in dictation) - * - Disabled when OpenAI API key not configured + * Records audio, sends to backend for Whisper transcription, returns text. + * Hidden on mobile (native keyboards have built-in dictation). */ import { useState, useCallback, useRef, useEffect } from "react"; -// Check if we're on a mobile device (touch-based) -function isMobileDevice(): boolean { - if (typeof window === "undefined") return false; - // Check for touch capability and small screen as heuristics - return ("ontouchstart" in window || navigator.maxTouchPoints > 0) && window.innerWidth < 768; -} - -// Check if MediaRecorder is available -function isMediaRecorderSupported(): boolean { - return typeof window !== "undefined" && typeof MediaRecorder !== "undefined"; -} +export type VoiceInputState = "idle" | "recording" | "transcribing"; export interface UseVoiceInputOptions { - /** Called when transcript text is received */ - onTranscript: (text: string, isFinal: boolean) => void; - /** Called when an error occurs */ + onTranscript: (text: string) => void; onError?: (error: string) => void; - /** Called to send the message (used by stopListeningAndSend) */ onSend?: () => void; - /** Whether OpenAI API key is configured */ openAIKeySet: boolean; } export interface UseVoiceInputResult { - /** Whether voice input is currently recording */ - isListening: boolean; - /** Whether transcription is in progress */ - isTranscribing: boolean; - /** Whether the browser supports MediaRecorder */ + state: VoiceInputState; isSupported: boolean; - /** Whether OpenAI API key is configured */ isApiKeySet: boolean; - /** Whether we should show voice UI (supported and not mobile) */ + /** Show UI on supported desktop platforms (mobile has native dictation) */ shouldShowUI: boolean; - /** Start recording for voice input */ - startListening: () => void; - /** Stop recording and transcribe */ - stopListening: () => void; - /** Stop recording, transcribe, and send when done */ - stopListeningAndSend: () => void; - /** Toggle recording state */ - toggleListening: () => void; + start: () => void; + stop: (options?: { send?: boolean }) => void; + toggle: () => void; } -export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResult { - const { onTranscript, onError, onSend, openAIKeySet } = options; +// Platform checks (evaluated once) +const isMobile = + typeof window !== "undefined" && + ("ontouchstart" in window || navigator.maxTouchPoints > 0) && + window.innerWidth < 768; - const [isListening, setIsListening] = useState(false); - const [isTranscribing, setIsTranscribing] = useState(false); +const isSupported = typeof window !== "undefined" && typeof MediaRecorder !== "undefined"; + +export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResult { + const [state, setState] = useState("idle"); const mediaRecorderRef = useRef(null); const audioChunksRef = useRef([]); const streamRef = useRef(null); - // Flag to auto-send after transcription completes const sendAfterTranscribeRef = useRef(false); - const isSupported = isMediaRecorderSupported(); - const isMobile = isMobileDevice(); - - // Store callbacks in refs to avoid recreating on every render - const onTranscriptRef = useRef(onTranscript); - const onErrorRef = useRef(onError); - const onSendRef = useRef(onSend); + // Store callbacks in refs to avoid stale closures + const callbacksRef = useRef(options); useEffect(() => { - onTranscriptRef.current = onTranscript; - onErrorRef.current = onError; - onSendRef.current = onSend; - }, [onTranscript, onError, onSend]); - - const transcribeAudio = useCallback(async (audioBlob: Blob) => { - setIsTranscribing(true); - const shouldSendAfter = sendAfterTranscribeRef.current; + callbacksRef.current = options; + }, [options]); + + const transcribe = useCallback(async (audioBlob: Blob) => { + setState("transcribing"); + const shouldSend = sendAfterTranscribeRef.current; sendAfterTranscribeRef.current = false; try { - // Convert blob to base64 const arrayBuffer = await audioBlob.arrayBuffer(); const base64 = btoa( new Uint8Array(arrayBuffer).reduce((data, byte) => data + String.fromCharCode(byte), "") ); - // Call backend to transcribe const result = await window.api.voice.transcribe(base64); - if (result.success) { - if (result.data.trim()) { - onTranscriptRef.current(result.data, true); - // Auto-send after transcript is set (use setTimeout to let React update state) - if (shouldSendAfter) { - setTimeout(() => onSendRef.current?.(), 0); - } + if (result.success && result.data.trim()) { + callbacksRef.current.onTranscript(result.data); + if (shouldSend) { + setTimeout(() => callbacksRef.current.onSend?.(), 0); } - } else { - onErrorRef.current?.(result.error); + } else if (!result.success) { + callbacksRef.current.onError?.(result.error); } } catch (err) { const message = err instanceof Error ? err.message : String(err); - onErrorRef.current?.(`Transcription failed: ${message}`); + callbacksRef.current.onError?.(`Transcription failed: ${message}`); } finally { - setIsTranscribing(false); + setState("idle"); } }, []); - const startListening = useCallback(async () => { - if (!isSupported || isListening || isTranscribing || !openAIKeySet) return; + const start = useCallback(async () => { + if (!isSupported || state !== "idle" || !callbacksRef.current.openAIKeySet) return; try { const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); streamRef.current = stream; - // Use webm/opus which is well supported and works with Whisper const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus") ? "audio/webm;codecs=opus" : "audio/webm"; - const mediaRecorder = new MediaRecorder(stream, { mimeType }); + const recorder = new MediaRecorder(stream, { mimeType }); audioChunksRef.current = []; - mediaRecorder.ondataavailable = (event) => { - if (event.data.size > 0) { - audioChunksRef.current.push(event.data); - } + recorder.ondataavailable = (e) => { + if (e.data.size > 0) audioChunksRef.current.push(e.data); }; - mediaRecorder.onstop = () => { - const audioBlob = new Blob(audioChunksRef.current, { type: mimeType }); + recorder.onstop = () => { + const blob = new Blob(audioChunksRef.current, { type: mimeType }); audioChunksRef.current = []; - - // Stop all tracks to release microphone - stream.getTracks().forEach((track) => track.stop()); + stream.getTracks().forEach((t) => t.stop()); streamRef.current = null; - - // Transcribe the audio - void transcribeAudio(audioBlob); + void transcribe(blob); }; - mediaRecorder.onerror = () => { - onErrorRef.current?.("Recording failed"); - setIsListening(false); - stream.getTracks().forEach((track) => track.stop()); + recorder.onerror = () => { + callbacksRef.current.onError?.("Recording failed"); + setState("idle"); + stream.getTracks().forEach((t) => t.stop()); streamRef.current = null; }; - mediaRecorderRef.current = mediaRecorder; - mediaRecorder.start(); - setIsListening(true); + mediaRecorderRef.current = recorder; + recorder.start(); + setState("recording"); } catch (err) { const message = err instanceof Error ? err.message : String(err); - if (message.includes("Permission denied") || message.includes("NotAllowedError")) { - onErrorRef.current?.( - "Microphone access denied. Please allow microphone access and try again." - ); - } else { - onErrorRef.current?.(`Failed to start recording: ${message}`); - } + const isPermissionError = + message.includes("Permission denied") || message.includes("NotAllowedError"); + callbacksRef.current.onError?.( + isPermissionError + ? "Microphone access denied. Please allow microphone access and try again." + : `Failed to start recording: ${message}` + ); } - }, [isSupported, isListening, isTranscribing, openAIKeySet, transcribeAudio]); + }, [state, transcribe]); - const stopListening = useCallback(() => { - if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") { - mediaRecorderRef.current.stop(); + const stop = useCallback((options?: { send?: boolean }) => { + if (options?.send) sendAfterTranscribeRef.current = true; + if (mediaRecorderRef.current?.state !== "inactive") { + mediaRecorderRef.current?.stop(); mediaRecorderRef.current = null; } - setIsListening(false); + // Note: setState("idle") not called here - transcribe() handles transition }, []); - const stopListeningAndSend = useCallback(() => { - sendAfterTranscribeRef.current = true; - stopListening(); - }, [stopListening]); - - const toggleListening = useCallback(() => { - if (isListening) { - stopListening(); - } else { - void startListening(); + const toggle = useCallback(() => { + if (state === "recording") { + stop(); + } else if (state === "idle") { + void start(); } - }, [isListening, startListening, stopListening]); + }, [state, start, stop]); // Cleanup on unmount useEffect(() => { return () => { - if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") { - mediaRecorderRef.current.stop(); - } - if (streamRef.current) { - streamRef.current.getTracks().forEach((track) => track.stop()); - } + mediaRecorderRef.current?.stop(); + streamRef.current?.getTracks().forEach((t) => t.stop()); }; }, []); return { - isListening, - isTranscribing, + state, isSupported, - isApiKeySet: openAIKeySet, - // Show UI on supported desktop platforms (mobile has native dictation) + isApiKeySet: callbacksRef.current.openAIKeySet, shouldShowUI: isSupported && !isMobile, - startListening: () => void startListening(), - stopListening, - stopListeningAndSend, - toggleListening, + start: () => void start(), + stop, + toggle, }; } From 944e34143abe60c51a418845f316d678e75fd552 Mon Sep 17 00:00:00 2001 From: Ammar Date: Mon, 1 Dec 2025 23:59:58 -0600 Subject: [PATCH 10/18] feat: space on empty input starts voice, escape cancels - Pressing space on empty chat input starts voice recording (convenient alternative to Cmd+D) - Pressing escape during recording cancels without transcribing - Add cancel() method to voice hook that sets flag to skip transcribe - Updated overlay text to show all shortcuts --- src/browser/components/ChatInput/index.tsx | 18 +++++++++++++++++- src/browser/hooks/useVoiceInput.ts | 21 +++++++++++++++++++-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx index 7c733e013b..b51aea6050 100644 --- a/src/browser/components/ChatInput/index.tsx +++ b/src/browser/components/ChatInput/index.tsx @@ -846,6 +846,19 @@ export const ChatInput: React.FC = (props) => { return; } + // Space on empty input starts voice recording + if ( + e.key === " " && + input.trim() === "" && + voiceInput.shouldShowUI && + voiceInput.isApiKeySet && + voiceInput.state === "idle" + ) { + e.preventDefault(); + voiceInput.start(); + return; + } + // Handle open model selector if (matchesKeybind(e, KEYBINDS.OPEN_MODEL_SELECTOR)) { e.preventDefault(); @@ -985,6 +998,9 @@ export const ChatInput: React.FC = (props) => { if (e.key === " " && voiceInput.state === "recording") { e.preventDefault(); voiceInput.stop({ send: true }); + } else if (e.key === "Escape" && voiceInput.state === "recording") { + e.preventDefault(); + voiceInput.cancel(); } }} disabled={voiceInput.state === "transcribing"} @@ -1006,7 +1022,7 @@ export const ChatInput: React.FC = (props) => { )} > {voiceInput.state === "recording" - ? `Recording... space to send, ${formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} to stop` + ? `Recording... space to send, ${formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} to stop, esc to cancel` : "Transcribing..."} void; stop: (options?: { send?: boolean }) => void; + /** Cancel recording without transcribing (discard audio) */ + cancel: () => void; toggle: () => void; } @@ -42,6 +44,7 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul const audioChunksRef = useRef([]); const streamRef = useRef(null); const sendAfterTranscribeRef = useRef(false); + const cancelledRef = useRef(false); // Store callbacks in refs to avoid stale closures const callbacksRef = useRef(options); @@ -97,11 +100,17 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul }; recorder.onstop = () => { + const wasCancelled = cancelledRef.current; + cancelledRef.current = false; const blob = new Blob(audioChunksRef.current, { type: mimeType }); audioChunksRef.current = []; stream.getTracks().forEach((t) => t.stop()); streamRef.current = null; - void transcribe(blob); + if (wasCancelled) { + setState("idle"); + } else { + void transcribe(blob); + } }; recorder.onerror = () => { @@ -132,7 +141,14 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul mediaRecorderRef.current?.stop(); mediaRecorderRef.current = null; } - // Note: setState("idle") not called here - transcribe() handles transition + }, []); + + const cancel = useCallback(() => { + cancelledRef.current = true; + if (mediaRecorderRef.current?.state !== "inactive") { + mediaRecorderRef.current?.stop(); + mediaRecorderRef.current = null; + } }, []); const toggle = useCallback(() => { @@ -158,6 +174,7 @@ export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResul shouldShowUI: isSupported && !isMobile, start: () => void start(), stop, + cancel, toggle, }; } From f82b49f32d041087acc16a5a15d97f80718341dd Mon Sep 17 00:00:00 2001 From: Ammar Date: Tue, 2 Dec 2025 00:01:39 -0600 Subject: [PATCH 11/18] fix: remove ugly focus ring, improve voice tooltip - Add focus:outline-none to recording overlay button - Update tooltip to document all shortcuts: - Space on empty input to start - Cmd+D anytime to toggle - Space during recording to send - Escape to cancel --- src/browser/components/ChatInput/VoiceInputButton.tsx | 7 ++++++- src/browser/components/ChatInput/index.tsx | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/browser/components/ChatInput/VoiceInputButton.tsx b/src/browser/components/ChatInput/VoiceInputButton.tsx index b4f27bede8..228d330b5d 100644 --- a/src/browser/components/ChatInput/VoiceInputButton.tsx +++ b/src/browser/components/ChatInput/VoiceInputButton.tsx @@ -60,7 +60,12 @@ export const VoiceInputButton: React.FC = (props) => { ) : ( <> - {label} ({formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)}) + Voice input — press space on empty input +
+ or {formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} anytime +
+
+ While recording: space sends, esc cancels )} diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx index b51aea6050..c6e1b0c04c 100644 --- a/src/browser/components/ChatInput/index.tsx +++ b/src/browser/components/ChatInput/index.tsx @@ -1005,7 +1005,7 @@ export const ChatInput: React.FC = (props) => { }} disabled={voiceInput.state === "transcribing"} className={cn( - "mb-1 flex min-h-[60px] w-full items-center justify-center gap-3 rounded-md border px-4 py-4 transition-all", + "mb-1 flex min-h-[60px] w-full items-center justify-center gap-3 rounded-md border px-4 py-4 transition-all focus:outline-none", voiceInput.state === "recording" ? "cursor-pointer border-blue-500 bg-blue-500/10" : "cursor-wait border-amber-500 bg-amber-500/10" From 91c65e79d8c631431bd98c574ed62eff64bdbcae Mon Sep 17 00:00:00 2001 From: Ammar Date: Tue, 2 Dec 2025 00:02:48 -0600 Subject: [PATCH 12/18] fix: use gpt-4o-transcribe model instead of whisper-1 --- src/node/services/ipcMain.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/node/services/ipcMain.ts b/src/node/services/ipcMain.ts index 9fa51fb784..ebd89108bc 100644 --- a/src/node/services/ipcMain.ts +++ b/src/node/services/ipcMain.ts @@ -661,7 +661,7 @@ export class IpcMain { // Call Whisper API const transcription = await client.audio.transcriptions.create({ file: audioFile, - model: "whisper-1", + model: "gpt-4o-transcribe", }); return Ok(transcription.text); From 92e31f3af94bbd4110059b3abcc0f7a4c111fc11 Mon Sep 17 00:00:00 2001 From: Ammar Date: Tue, 2 Dec 2025 00:04:08 -0600 Subject: [PATCH 13/18] fix: global keybinds during recording work regardless of focus - Add window-level keydown listener active only during recording - Space and Escape work even if overlay button loses focus - Removed redundant local onKeyDown and auto-focus from button --- src/browser/components/ChatInput/index.tsx | 27 ++++++++++++++-------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx index c6e1b0c04c..f6b8814b61 100644 --- a/src/browser/components/ChatInput/index.tsx +++ b/src/browser/components/ChatInput/index.tsx @@ -500,6 +500,23 @@ export const ChatInput: React.FC = (props) => { window.removeEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handler as EventListener); }, [voiceInput, setToast]); + // Global keybinds during recording (work regardless of focus) + useEffect(() => { + if (voiceInput.state !== "recording") return; + + const handler = (e: KeyboardEvent) => { + if (e.key === " ") { + e.preventDefault(); + voiceInput.stop({ send: true }); + } else if (e.key === "Escape") { + e.preventDefault(); + voiceInput.cancel(); + } + }; + window.addEventListener("keydown", handler); + return () => window.removeEventListener("keydown", handler); + }, [voiceInput]); + // Auto-focus chat input when workspace changes (workspace only) const workspaceIdForFocus = variant === "workspace" ? props.workspaceId : null; useEffect(() => { @@ -992,17 +1009,7 @@ export const ChatInput: React.FC = (props) => { {voiceInput.state !== "idle" ? (