diff --git a/bun.lock b/bun.lock index 899b1121ae..32ee53eb40 100644 --- a/bun.lock +++ b/bun.lock @@ -42,6 +42,7 @@ "minimist": "^1.2.8", "motion": "^12.23.24", "ollama-ai-provider-v2": "^1.5.4", + "openai": "^6.9.1", "rehype-harden": "^1.1.5", "shescape": "^2.1.6", "source-map-support": "^0.5.21", @@ -2688,6 +2689,8 @@ "oniguruma-to-es": ["oniguruma-to-es@4.3.3", "", { "dependencies": { "oniguruma-parser": "^0.12.1", "regex": "^6.0.1", "regex-recursion": "^6.0.2" } }, "sha512-rPiZhzC3wXwE59YQMRDodUwwT9FZ9nNBwQQfsd1wfdtlKEyCdRV0avrTcSZ5xlIvGRVPd/cx6ZN45ECmS39xvg=="], + "openai": ["openai@6.9.1", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-vQ5Rlt0ZgB3/BNmTa7bIijYFhz3YBceAA3Z4JuoMSBftBF9YqFHIEhZakSs+O/Ad7EaoEimZvHxD5ylRjN11Lg=="], + "optionator": ["optionator@0.9.4", "", { "dependencies": { "deep-is": "^0.1.3", "fast-levenshtein": "^2.0.6", "levn": "^0.4.1", "prelude-ls": "^1.2.1", "type-check": "^0.4.0", "word-wrap": "^1.2.5" } }, "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g=="], "ora": ["ora@5.4.1", "", { "dependencies": { "bl": "^4.1.0", "chalk": "^4.1.0", "cli-cursor": "^3.1.0", "cli-spinners": "^2.5.0", "is-interactive": "^1.0.0", "is-unicode-supported": "^0.1.0", "log-symbols": "^4.1.0", "strip-ansi": "^6.0.0", "wcwidth": "^1.0.1" } }, "sha512-5b6Y85tPxZZ7QytO+BQzysW31HJku27cRIlkbAXaNx+BdcVi+LlRFmVXzeF6a7JCwJpyw5c4b+YSVImQIrBpuQ=="], diff --git a/package.json b/package.json index 80ea0a48c4..28b2be79e3 100644 --- a/package.json +++ b/package.json @@ -83,6 +83,7 @@ "minimist": "^1.2.8", "motion": "^12.23.24", "ollama-ai-provider-v2": "^1.5.4", + "openai": "^6.9.1", "rehype-harden": "^1.1.5", "shescape": "^2.1.6", "source-map-support": "^0.5.21", diff --git a/src/browser/api.ts b/src/browser/api.ts index 33b9ad37ab..a98675cb4a 100644 --- a/src/browser/api.ts +++ b/src/browser/api.ts @@ -361,6 +361,9 @@ const webApi: IPCApi = { }, closeWindow: (workspaceId) => invokeIPC(IPC_CHANNELS.TERMINAL_WINDOW_CLOSE, workspaceId), }, + voice: { + transcribe: (audioBase64) => invokeIPC(IPC_CHANNELS.VOICE_TRANSCRIBE, audioBase64), + }, update: { check: () => invokeIPC(IPC_CHANNELS.UPDATE_CHECK), download: () => invokeIPC(IPC_CHANNELS.UPDATE_DOWNLOAD), diff --git a/src/browser/components/ChatInput/VoiceInputButton.tsx b/src/browser/components/ChatInput/VoiceInputButton.tsx new file mode 100644 index 0000000000..228d330b5d --- /dev/null +++ b/src/browser/components/ChatInput/VoiceInputButton.tsx @@ -0,0 +1,74 @@ +/** + * Voice input button - floats inside the chat input textarea. + * Minimal footprint: just an icon that changes color based on state. 
+ */ + +import React from "react"; +import { Mic, Loader2 } from "lucide-react"; +import { TooltipWrapper, Tooltip } from "../Tooltip"; +import { formatKeybind, KEYBINDS } from "@/browser/utils/ui/keybinds"; +import { cn } from "@/common/lib/utils"; +import type { VoiceInputState } from "@/browser/hooks/useVoiceInput"; + +interface VoiceInputButtonProps { + state: VoiceInputState; + isApiKeySet: boolean; + shouldShowUI: boolean; + onToggle: () => void; + disabled?: boolean; +} + +const STATE_CONFIG: Record = { + idle: { label: "Voice input", colorClass: "text-muted/50 hover:text-muted" }, + recording: { label: "Stop recording", colorClass: "text-blue-500 animate-pulse" }, + transcribing: { label: "Transcribing...", colorClass: "text-amber-500" }, +}; + +export const VoiceInputButton: React.FC = (props) => { + if (!props.shouldShowUI) return null; + + const needsApiKey = !props.isApiKeySet; + const { label, colorClass } = needsApiKey + ? { label: "Voice input (requires OpenAI API key)", colorClass: "text-muted/50" } + : STATE_CONFIG[props.state]; + + const Icon = props.state === "transcribing" ? Loader2 : Mic; + const isTranscribing = props.state === "transcribing"; + + return ( + + + + {needsApiKey ? ( + <> + Voice input requires OpenAI API key. +
+ Configure in Settings → Providers. + + ) : ( + <> + Voice input — press space on empty input +
+ or {formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} anytime +
+
+ While recording: space sends, esc cancels + + )} +
+
+  );
+};
diff --git a/src/browser/components/ChatInput/WaveformBars.tsx b/src/browser/components/ChatInput/WaveformBars.tsx
new file mode 100644
index 0000000000..e8c834016b
--- /dev/null
+++ b/src/browser/components/ChatInput/WaveformBars.tsx
@@ -0,0 +1,32 @@
+/**
+ * Animated waveform bars for voice recording UI.
+ * Shows 5 bars with staggered pulse animation.
+ */
+
+import { cn } from "@/common/lib/utils";
+
+interface WaveformBarsProps {
+  /** Color class for the bars (e.g., "bg-blue-500") */
+  colorClass: string;
+  /** Whether to mirror the animation (for right-side waveform) */
+  mirrored?: boolean;
+}
+
+export const WaveformBars: React.FC<WaveformBarsProps> = (props) => {
+  const indices = props.mirrored ? [4, 3, 2, 1, 0] : [0, 1, 2, 3, 4];
+
+  return (
+
+ {indices.map((i, displayIndex) => ( +
+ ))} +
+ ); +}; diff --git a/src/browser/components/ChatInput/index.tsx b/src/browser/components/ChatInput/index.tsx index 0a686843be..0d6e215390 100644 --- a/src/browser/components/ChatInput/index.tsx +++ b/src/browser/components/ChatInput/index.tsx @@ -65,6 +65,9 @@ import { cn } from "@/common/lib/utils"; import { CreationControls } from "./CreationControls"; import { useCreationWorkspace } from "./useCreationWorkspace"; import { useTutorial } from "@/browser/contexts/TutorialContext"; +import { useVoiceInput } from "@/browser/hooks/useVoiceInput"; +import { VoiceInputButton } from "./VoiceInputButton"; +import { WaveformBars } from "./WaveformBars"; const LEADING_COMMAND_NOISE = /^(?:\s|\u200B|\u200C|\u200D|\u200E|\u200F|\uFEFF)+/; @@ -154,6 +157,24 @@ export const ChatInput: React.FC = (props) => { }); const { startSequence: startTutorial } = useTutorial(); + // Track if OpenAI API key is configured for voice input + const [openAIKeySet, setOpenAIKeySet] = useState(false); + + // Voice input - appends transcribed text to input + const voiceInput = useVoiceInput({ + onTranscript: (text) => { + setInput((prev) => { + const separator = prev.length > 0 && !prev.endsWith(" ") ? " " : ""; + return prev + separator + text; + }); + }, + onError: (error) => { + setToast({ id: Date.now().toString(), type: "error", message: error }); + }, + onSend: () => void handleSend(), + openAIKeySet, + }); + // Start creation tutorial when entering creation mode useEffect(() => { if (variant === "creation") { @@ -370,6 +391,28 @@ export const ChatInput: React.FC = (props) => { }; }, []); + // Check if OpenAI API key is configured (for voice input) + useEffect(() => { + let isMounted = true; + + const checkOpenAIKey = async () => { + try { + const config = await window.api.providers.getConfig(); + if (isMounted) { + setOpenAIKeySet(config.openai?.apiKeySet ?? false); + } + } catch (error) { + console.error("Failed to check OpenAI API key:", error); + } + }; + + void checkOpenAIKey(); + + return () => { + isMounted = false; + }; + }, []); + // Allow external components (e.g., CommandPalette, Queued message edits) to insert text useEffect(() => { const handler = (e: Event) => { @@ -437,6 +480,42 @@ export const ChatInput: React.FC = (props) => { window.removeEventListener(CUSTOM_EVENTS.THINKING_LEVEL_TOAST, handler as EventListener); }, [variant, props, setToast]); + // Voice input: command palette toggle + global recording keybinds + useEffect(() => { + if (!voiceInput.shouldShowUI) return; + + const handleToggle = () => { + if (!voiceInput.isApiKeySet) { + setToast({ + id: Date.now().toString(), + type: "error", + message: "Voice input requires OpenAI API key. 
Configure in Settings → Providers.", + }); + return; + } + voiceInput.toggle(); + }; + + // Global keybinds only active during recording + const handleKeyDown = (e: KeyboardEvent) => { + if (voiceInput.state !== "recording") return; + if (e.key === " ") { + e.preventDefault(); + voiceInput.stop({ send: true }); + } else if (e.key === "Escape") { + e.preventDefault(); + voiceInput.cancel(); + } + }; + + window.addEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handleToggle as EventListener); + window.addEventListener("keydown", handleKeyDown); + return () => { + window.removeEventListener(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT, handleToggle as EventListener); + window.removeEventListener("keydown", handleKeyDown); + }; + }, [voiceInput, setToast]); + // Auto-focus chat input when workspace changes (workspace only) const workspaceIdForFocus = variant === "workspace" ? props.workspaceId : null; useEffect(() => { @@ -768,6 +847,34 @@ export const ChatInput: React.FC = (props) => { return; } + // Handle voice input toggle (Ctrl+D / Cmd+D) + if (matchesKeybind(e, KEYBINDS.TOGGLE_VOICE_INPUT) && voiceInput.shouldShowUI) { + e.preventDefault(); + if (!voiceInput.isApiKeySet) { + setToast({ + id: Date.now().toString(), + type: "error", + message: "Voice input requires OpenAI API key. Configure in Settings → Providers.", + }); + return; + } + voiceInput.toggle(); + return; + } + + // Space on empty input starts voice recording + if ( + e.key === " " && + input.trim() === "" && + voiceInput.shouldShowUI && + voiceInput.isApiKeySet && + voiceInput.state === "idle" + ) { + e.preventDefault(); + voiceInput.start(); + return; + } + // Handle open model selector if (matchesKeybind(e, KEYBINDS.OPEN_MODEL_SELECTOR)) { e.preventDefault(); @@ -896,33 +1003,81 @@ export const ChatInput: React.FC = (props) => { anchorRef={variant === "creation" ? inputRef : undefined} /> -
- 0 ? commandListId : undefined - } - aria-expanded={showCommandSuggestions && commandSuggestions.length > 0} - /> +
+ {/* Recording/transcribing overlay - replaces textarea when active */} + {voiceInput.state !== "idle" ? ( + + ) : ( + <> + 0 + ? commandListId + : undefined + } + aria-expanded={showCommandSuggestions && commandSuggestions.length > 0} + /> + {/* Floating voice input button inside textarea */} +
+ +
+ + )}
{/* Image attachments */} -
+
{/* Editing indicator - workspace only */} {variant === "workspace" && editingMessage && (
@@ -930,7 +1085,7 @@ export const ChatInput: React.FC = (props) => {
)} -
+
{/* Model Selector - always visible */}
= (props) => { disabled={!canSend} aria-label="Send message" className={cn( - "inline-flex items-center gap-1 rounded-sm border border-border-light px-2 py-1 text-[11px] font-medium text-white transition-colors duration-200 disabled:opacity-50", + "inline-flex items-center gap-1 rounded-sm border border-border-light px-1.5 py-0.5 text-[11px] font-medium text-white transition-colors duration-200 disabled:opacity-50", mode === "plan" ? "bg-plan-mode hover:bg-plan-mode-hover disabled:hover:bg-plan-mode" : "bg-exec-mode hover:bg-exec-mode-hover disabled:hover:bg-exec-mode" diff --git a/src/browser/components/ToggleGroup.tsx b/src/browser/components/ToggleGroup.tsx index 8525c983c9..0f812f7b4c 100644 --- a/src/browser/components/ToggleGroup.tsx +++ b/src/browser/components/ToggleGroup.tsx @@ -30,7 +30,7 @@ export function ToggleGroup({ onClick={() => onChange(nextOption.value)} type="button" className={cn( - "px-2 py-1 text-[11px] font-sans rounded-sm border-none cursor-pointer transition-all duration-150", + "px-1.5 py-0.5 text-[11px] font-sans rounded-sm border-none cursor-pointer transition-all duration-150", "text-toggle-text-active bg-toggle-active font-medium", activeOption?.activeClassName )} @@ -52,7 +52,7 @@ export function ToggleGroup({ aria-pressed={isActive} type="button" className={cn( - "px-2 py-1 text-[11px] font-sans rounded-sm border-none cursor-pointer transition-all duration-150 bg-transparent", + "px-1.5 py-0.5 text-[11px] font-sans rounded-sm border-none cursor-pointer transition-all duration-150 bg-transparent", isActive ? "text-toggle-text-active bg-toggle-active font-medium" : "text-toggle-text font-normal hover:text-toggle-text-hover hover:bg-toggle-hover", diff --git a/src/browser/hooks/useVoiceInput.ts b/src/browser/hooks/useVoiceInput.ts new file mode 100644 index 0000000000..46d273d3dd --- /dev/null +++ b/src/browser/hooks/useVoiceInput.ts @@ -0,0 +1,248 @@ +/** + * Voice input via OpenAI transcription (gpt-4o-transcribe). + * + * State machine: idle → recording → transcribing → idle + * + * Hidden on touch devices where native keyboard dictation is available. + */ + +import { useState, useCallback, useRef, useEffect } from "react"; + +export type VoiceInputState = "idle" | "recording" | "transcribing"; + +export interface UseVoiceInputOptions { + onTranscript: (text: string) => void; + onError?: (error: string) => void; + /** Called after successful transcription if stop({ send: true }) was used */ + onSend?: () => void; + openAIKeySet: boolean; +} + +export interface UseVoiceInputResult { + state: VoiceInputState; + isSupported: boolean; + isApiKeySet: boolean; + /** False on touch devices (they have native keyboard dictation) */ + shouldShowUI: boolean; + start: () => void; + stop: (options?: { send?: boolean }) => void; + cancel: () => void; + toggle: () => void; +} + +// ============================================================================= +// Platform Detection +// ============================================================================= + +/** + * Detect touch devices where native keyboard dictation is typically available. + * This includes phones, tablets (iPad), and touch-enabled laptops in tablet mode. + * We hide our voice UI on these devices to avoid redundancy with system dictation. + */ +function hasTouchDictation(): boolean { + if (typeof window === "undefined") return false; + const hasTouch = "ontouchstart" in window || navigator.maxTouchPoints > 0; + // Touch-only check: most touch devices have native dictation. 
+ // We don't check screen size because iPads are large but still have dictation. + return hasTouch; +} + +const HAS_TOUCH_DICTATION = hasTouchDictation(); +const HAS_MEDIA_RECORDER = typeof window !== "undefined" && typeof MediaRecorder !== "undefined"; + +// ============================================================================= +// Hook +// ============================================================================= + +export function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputResult { + const [state, setState] = useState("idle"); + + // Refs for MediaRecorder lifecycle + const recorderRef = useRef(null); + const streamRef = useRef(null); + const chunksRef = useRef([]); + + // Flags set before stopping to control post-stop behavior + const shouldSendRef = useRef(false); + const wasCancelledRef = useRef(false); + + // Keep callbacks fresh without recreating functions + const callbacksRef = useRef(options); + useEffect(() => { + callbacksRef.current = options; + }, [options]); + + // --------------------------------------------------------------------------- + // Transcription + // --------------------------------------------------------------------------- + + const transcribe = useCallback(async (audioBlob: Blob) => { + setState("transcribing"); + + // Capture and reset flags + const shouldSend = shouldSendRef.current; + shouldSendRef.current = false; + + try { + // Encode audio as base64 for IPC transport + const buffer = await audioBlob.arrayBuffer(); + const base64 = btoa( + new Uint8Array(buffer).reduce((str, byte) => str + String.fromCharCode(byte), "") + ); + + const result = await window.api.voice.transcribe(base64); + + if (!result.success) { + callbacksRef.current.onError?.(result.error); + return; + } + + const text = result.data.trim(); + if (!text) return; // Empty transcription, nothing to do + + callbacksRef.current.onTranscript(text); + + // If stop({ send: true }) was called, trigger send after React flushes + if (shouldSend) { + setTimeout(() => callbacksRef.current.onSend?.(), 0); + } + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + callbacksRef.current.onError?.(`Transcription failed: ${msg}`); + } finally { + setState("idle"); + } + }, []); + + // --------------------------------------------------------------------------- + // Release microphone and clean up recorder + // --------------------------------------------------------------------------- + + const releaseStream = useCallback(() => { + streamRef.current?.getTracks().forEach((t) => t.stop()); + streamRef.current = null; + }, []); + + // --------------------------------------------------------------------------- + // Start Recording + // --------------------------------------------------------------------------- + + const start = useCallback(async () => { + // Guard: only start from idle state with valid configuration + const canStart = + HAS_MEDIA_RECORDER && + !HAS_TOUCH_DICTATION && + state === "idle" && + callbacksRef.current.openAIKeySet; + + if (!canStart) return; + + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + streamRef.current = stream; + + const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus") + ? 
"audio/webm;codecs=opus" + : "audio/webm"; + + const recorder = new MediaRecorder(stream, { mimeType }); + chunksRef.current = []; + + recorder.ondataavailable = (e) => { + if (e.data.size > 0) chunksRef.current.push(e.data); + }; + + recorder.onstop = () => { + // Check if this was a cancel (discard audio) or normal stop (transcribe) + const cancelled = wasCancelledRef.current; + wasCancelledRef.current = false; + + const blob = new Blob(chunksRef.current, { type: mimeType }); + chunksRef.current = []; + releaseStream(); + + if (cancelled) { + setState("idle"); + } else { + void transcribe(blob); + } + }; + + recorder.onerror = () => { + callbacksRef.current.onError?.("Recording failed"); + releaseStream(); + setState("idle"); + }; + + recorderRef.current = recorder; + recorder.start(); + setState("recording"); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + const isPermissionDenied = msg.includes("Permission denied") || msg.includes("NotAllowed"); + + callbacksRef.current.onError?.( + isPermissionDenied + ? "Microphone access denied. Please allow microphone access and try again." + : `Failed to start recording: ${msg}` + ); + } + }, [state, transcribe, releaseStream]); + + // --------------------------------------------------------------------------- + // Stop Recording (triggers transcription) + // --------------------------------------------------------------------------- + + const stop = useCallback((options?: { send?: boolean }) => { + if (options?.send) shouldSendRef.current = true; + + if (recorderRef.current?.state !== "inactive") { + recorderRef.current?.stop(); + recorderRef.current = null; + } + }, []); + + // --------------------------------------------------------------------------- + // Cancel Recording (discard audio, no transcription) + // --------------------------------------------------------------------------- + + const cancel = useCallback(() => { + wasCancelledRef.current = true; + stop(); + }, [stop]); + + // --------------------------------------------------------------------------- + // Toggle (convenience for keybinds) + // --------------------------------------------------------------------------- + + const toggle = useCallback(() => { + if (state === "recording") stop(); + else if (state === "idle") void start(); + }, [state, start, stop]); + + // --------------------------------------------------------------------------- + // Cleanup on unmount + // --------------------------------------------------------------------------- + + useEffect(() => { + return () => { + recorderRef.current?.stop(); + releaseStream(); + }; + }, [releaseStream]); + + // --------------------------------------------------------------------------- + // Return + // --------------------------------------------------------------------------- + + return { + state, + isSupported: HAS_MEDIA_RECORDER, + isApiKeySet: callbacksRef.current.openAIKeySet, + shouldShowUI: HAS_MEDIA_RECORDER && !HAS_TOUCH_DICTATION, + start: () => void start(), + stop, + cancel, + toggle, + }; +} diff --git a/src/browser/stories/App.chat.stories.tsx b/src/browser/stories/App.chat.stories.tsx index 9aa03e0a2e..295d9e5232 100644 --- a/src/browser/stories/App.chat.stories.tsx +++ b/src/browser/stories/App.chat.stories.tsx @@ -212,6 +212,32 @@ export const WithAgentStatus: AppStory = { ), }; +/** Voice input button shows user education when OpenAI API key is not set */ +export const VoiceInputNoApiKey: AppStory = { + render: () => ( + { + setupSimpleChatStory({ + messages: [], + // No 
OpenAI key configured - voice button should be disabled with tooltip + providersConfig: { + anthropic: { apiKeySet: true }, + // openai deliberately missing + }, + }); + }} + /> + ), + parameters: { + docs: { + description: { + story: + "Shows the voice input button in disabled state when OpenAI API key is not configured. Hover over the mic icon in the chat input to see the user education tooltip.", + }, + }, + }, +}; + /** Streaming/working state with pending tool call */ export const Streaming: AppStory = { render: () => ( diff --git a/src/browser/stories/mockFactory.ts b/src/browser/stories/mockFactory.ts index 29916ebca6..3de5878047 100644 --- a/src/browser/stories/mockFactory.ts +++ b/src/browser/stories/mockFactory.ts @@ -456,6 +456,9 @@ export function createMockAPI(options: MockAPIOptions): IPCApi { openWindow: () => Promise.resolve(undefined), closeWindow: () => Promise.resolve(undefined), }, + voice: { + transcribe: () => Promise.resolve({ success: false, error: "Not implemented in mock" }), + }, update: { check: () => Promise.resolve(undefined), download: () => Promise.resolve(undefined), diff --git a/src/browser/stories/storyHelpers.ts b/src/browser/stories/storyHelpers.ts index d35544ba79..3b555e69c3 100644 --- a/src/browser/stories/storyHelpers.ts +++ b/src/browser/stories/storyHelpers.ts @@ -55,6 +55,7 @@ export interface SimpleChatSetupOptions { projectName?: string; messages: MuxMessage[]; gitStatus?: GitStatusFixture; + providersConfig?: Record; } /** @@ -82,6 +83,7 @@ export function setupSimpleChatStory(opts: SimpleChatSetupOptions): void { workspaces, chatHandlers, gitStatus, + providersConfig: opts.providersConfig, }) ); diff --git a/src/browser/utils/commandIds.ts b/src/browser/utils/commandIds.ts index 8976082fc4..cc92aa0057 100644 --- a/src/browser/utils/commandIds.ts +++ b/src/browser/utils/commandIds.ts @@ -39,6 +39,7 @@ export const CommandIds = { chatTruncate: (pct: number) => `${COMMAND_ID_PREFIXES.CHAT_TRUNCATE}${pct}` as const, chatInterrupt: () => "chat:interrupt" as const, chatJumpBottom: () => "chat:jumpBottom" as const, + chatVoiceInput: () => "chat:voiceInput" as const, // Mode commands modeToggle: () => "mode:toggle" as const, diff --git a/src/browser/utils/commands/sources.ts b/src/browser/utils/commands/sources.ts index 09029e5f44..8738f6b5ee 100644 --- a/src/browser/utils/commands/sources.ts +++ b/src/browser/utils/commands/sources.ts @@ -388,6 +388,17 @@ export function buildCoreSources(p: BuildSourcesParams): Array<() => CommandActi window.dispatchEvent(ev); }, }); + list.push({ + id: CommandIds.chatVoiceInput(), + title: "Toggle Voice Input", + subtitle: "Dictate instead of typing", + section: section.chat, + shortcutHint: formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT), + run: () => { + // Dispatch custom event; ChatInput listens for it + window.dispatchEvent(createCustomEvent(CUSTOM_EVENTS.TOGGLE_VOICE_INPUT)); + }, + }); } return list; }); diff --git a/src/browser/utils/ui/keybinds.ts b/src/browser/utils/ui/keybinds.ts index 0a85f645b4..0bc979c542 100644 --- a/src/browser/utils/ui/keybinds.ts +++ b/src/browser/utils/ui/keybinds.ts @@ -285,4 +285,9 @@ export const KEYBINDS = { /** Open settings modal */ // macOS: Cmd+, Win/Linux: Ctrl+, OPEN_SETTINGS: { key: ",", ctrl: true }, + + /** Toggle voice input (dictation) */ + // macOS: Cmd+D, Win/Linux: Ctrl+D + // "D" for Dictate - intuitive and available + TOGGLE_VOICE_INPUT: { key: "d", ctrl: true }, } as const; diff --git a/src/common/constants/events.ts b/src/common/constants/events.ts index 
ccbd592113..807bc226fa 100644
--- a/src/common/constants/events.ts
+++ b/src/common/constants/events.ts
@@ -56,6 +56,12 @@ export const CUSTOM_EVENTS = {
   * Detail: { projectPath: string, startMessage?: string, model?: string, trunkBranch?: string, runtime?: string }
   */
  START_WORKSPACE_CREATION: "mux:startWorkspaceCreation",
+
+  /**
+   * Event to toggle voice input (dictation) mode
+   * No detail
+   */
+  TOGGLE_VOICE_INPUT: "mux:toggleVoiceInput",
 } as const;
 
 /**
@@ -94,6 +100,7 @@ export interface CustomEventPayloads {
     trunkBranch?: string;
     runtime?: string;
   };
+  [CUSTOM_EVENTS.TOGGLE_VOICE_INPUT]: never; // No payload
 }
 
 /**
diff --git a/src/common/constants/ipc-constants.ts b/src/common/constants/ipc-constants.ts
index be7bc45ccf..c335928a09 100644
--- a/src/common/constants/ipc-constants.ts
+++ b/src/common/constants/ipc-constants.ts
@@ -68,6 +68,9 @@ export const IPC_CHANNELS = {
   TOKENIZER_COUNT_TOKENS: "tokenizer:countTokens",
   TOKENIZER_COUNT_TOKENS_BATCH: "tokenizer:countTokensBatch",
 
+  // Voice channels
+  VOICE_TRANSCRIBE: "voice:transcribe",
+
   // Dynamic channel prefixes
   WORKSPACE_CHAT_PREFIX: "workspace:chat:",
   WORKSPACE_METADATA: "workspace:metadata",
diff --git a/src/common/types/ipc.ts b/src/common/types/ipc.ts
index 22f844e504..d07846257e 100644
--- a/src/common/types/ipc.ts
+++ b/src/common/types/ipc.ts
@@ -371,6 +371,10 @@ export interface IPCApi {
     openWindow(workspaceId: string): Promise;
     closeWindow(workspaceId: string): Promise;
   };
+  voice: {
+    /** Transcribe audio using OpenAI's gpt-4o-transcribe model. Audio should be base64-encoded webm/opus. */
+    transcribe(audioBase64: string): Promise>;
+  };
   update: {
     check(): Promise;
     download(): Promise;
diff --git a/src/desktop/preload.ts b/src/desktop/preload.ts
index 8a9ea1c71c..8ac8a5f8d7 100644
--- a/src/desktop/preload.ts
+++ b/src/desktop/preload.ts
@@ -160,6 +160,10 @@ const api: IPCApi = {
   window: {
     setTitle: (title: string) => ipcRenderer.invoke(IPC_CHANNELS.WINDOW_SET_TITLE, title),
   },
+  voice: {
+    transcribe: (audioBase64: string) =>
+      ipcRenderer.invoke(IPC_CHANNELS.VOICE_TRANSCRIBE, audioBase64),
+  },
   update: {
     check: () => ipcRenderer.invoke(IPC_CHANNELS.UPDATE_CHECK),
     download: () => ipcRenderer.invoke(IPC_CHANNELS.UPDATE_DOWNLOAD),
diff --git a/src/node/services/ipcMain.ts b/src/node/services/ipcMain.ts
index 7de28a0d7e..ebd89108bc 100644
--- a/src/node/services/ipcMain.ts
+++ b/src/node/services/ipcMain.ts
@@ -44,6 +44,7 @@ import { PTYService } from "@/node/services/ptyService";
 import type { TerminalWindowManager } from "@/desktop/terminalWindowManager";
 import type { TerminalCreateParams, TerminalResizeParams } from "@/common/types/terminal";
 import { ExtensionMetadataService } from "@/node/services/ExtensionMetadataService";
+import OpenAI from "openai";
 
 /** Maximum number of retry attempts when workspace name collides */
 const MAX_WORKSPACE_NAME_COLLISION_RETRIES = 3;
@@ -633,6 +634,45 @@ export class IpcMain {
   }
 
   private registerWorkspaceHandlers(ipcMain: ElectronIpcMain): void {
+    // Voice transcription handler (uses OpenAI's gpt-4o-transcribe model)
+    ipcMain.handle(
+      IPC_CHANNELS.VOICE_TRANSCRIBE,
+      async (_event, audioBase64: string): Promise> => {
+        try {
+          // Get OpenAI config
+          const providersConfig = this.config.loadProvidersConfig();
+          const openaiConfig = providersConfig?.openai;
+
+          if (!openaiConfig?.apiKey) {
+            return Err("OpenAI API key not configured. Set it in Settings > Providers.");
+          }
+
+          const client = new OpenAI({
+            apiKey: openaiConfig.apiKey,
+            baseURL: openaiConfig.baseUrl ?? openaiConfig.baseURL,
+          });
+
+          // Convert base64 to buffer
+          const audioBuffer = Buffer.from(audioBase64, "base64");
+
+          // Create a File object for the API
+          const audioFile = new File([audioBuffer], "audio.webm", { type: "audio/webm" });
+
+          // Call the OpenAI transcription API
+          const transcription = await client.audio.transcriptions.create({
+            file: audioFile,
+            model: "gpt-4o-transcribe",
+          });
+
+          return Ok(transcription.text);
+        } catch (error) {
+          const message = error instanceof Error ? error.message : String(error);
+          log.error("[IpcMain] Voice transcription failed", error);
+          return Err(`Transcription failed: ${message}`);
+        }
+      }
+    );
+
     ipcMain.handle(
       IPC_CHANNELS.WORKSPACE_CREATE,
       async (
diff --git a/tests/e2e/scenarios/settings.spec.ts b/tests/e2e/scenarios/settings.spec.ts
index a6205d36e1..4ce2c4b15f 100644
--- a/tests/e2e/scenarios/settings.spec.ts
+++ b/tests/e2e/scenarios/settings.spec.ts
@@ -97,7 +97,9 @@ test.describe("Settings Modal", () => {
     // Verify all providers are listed with correct display names
     await expect(page.getByRole("button", { name: /Anthropic/i })).toBeVisible();
-    await expect(page.getByRole("button", { name: /OpenAI/i })).toBeVisible();
+    await expect(
+      page.getByRole("button", { name: /OpenAI/i }).filter({ has: page.getByText("OpenAI icon") })
+    ).toBeVisible();
     await expect(page.getByRole("button", { name: /Google/i })).toBeVisible();
     await expect(page.getByRole("button", { name: /xAI/i })).toBeVisible();
     await expect(page.getByRole("button", { name: /Ollama/i })).toBeVisible();
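
A note on the IPC transport used above: the renderer has no Node Buffer, so useVoiceInput encodes the recorded Blob to a base64 string with btoa, and the main-process handler decodes it back and wraps it in a File for the OpenAI SDK. A minimal standalone sketch of that round-trip — the helper names blobToBase64 and base64ToAudioFile are hypothetical, not defined in this diff:

```ts
// Renderer side: Blob -> base64 string (no Buffer in the renderer process).
async function blobToBase64(blob: Blob): Promise<string> {
  const bytes = new Uint8Array(await blob.arrayBuffer());
  let binary = "";
  // Same idea as the reduce() in useVoiceInput.ts: build a binary string, then btoa it.
  for (const byte of bytes) binary += String.fromCharCode(byte);
  return btoa(binary);
}

// Main-process side: base64 string -> File accepted by client.audio.transcriptions.create().
function base64ToAudioFile(audioBase64: string): File {
  const audioBuffer = Buffer.from(audioBase64, "base64");
  return new File([audioBuffer], "audio.webm", { type: "audio/webm" });
}
```

Recordings are short webm/opus clips, so the linear string build and the roughly 33% base64 size overhead are negligible on this path.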
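For readers who only skim the ChatInput wiring, here is a sketch of what consuming the hook looks like. The DictationDemo component is hypothetical and assumes an OpenAI key is already configured; it only illustrates the idle → recording → transcribing contract and the optional onSend callback:

```tsx
import React, { useState } from "react";
import { useVoiceInput } from "@/browser/hooks/useVoiceInput";

export const DictationDemo: React.FC = () => {
  const [draft, setDraft] = useState("");

  const voice = useVoiceInput({
    openAIKeySet: true, // assumption: key already configured in Settings → Providers
    onTranscript: (text) => setDraft((prev) => (prev ? `${prev} ${text}` : text)),
    onError: (message) => console.error("voice input error:", message),
    onSend: () => console.log("stop({ send: true }) would submit the draft here"),
  });

  // Touch devices fall back to native keyboard dictation, so render nothing there.
  if (!voice.shouldShowUI) return null;

  return (
    <div>
      <p>{draft || "Press the button and speak…"}</p>
      {/* toggle() starts from idle and stops (then transcribes) while recording */}
      <button onClick={voice.toggle} disabled={voice.state === "transcribing"}>
        {voice.state === "idle" && "Start dictation"}
        {voice.state === "recording" && "Stop"}
        {voice.state === "transcribing" && "Transcribing…"}
      </button>
    </div>
  );
};
```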