From d14d80fe62eade0bc4688f185b619fd1ee75b635 Mon Sep 17 00:00:00 2001 From: Vikhyath Mondreti Date: Sat, 13 Dec 2025 11:28:16 -0800 Subject: [PATCH 1/4] fix(deployed-chat): voice mode --- apps/sim/app/api/proxy/tts/stream/route.ts | 81 +++++++++++++++++-- apps/sim/app/chat/[identifier]/chat.tsx | 12 ++- .../sim/app/chat/hooks/use-audio-streaming.ts | 11 ++- 3 files changed, 95 insertions(+), 9 deletions(-) diff --git a/apps/sim/app/api/proxy/tts/stream/route.ts b/apps/sim/app/api/proxy/tts/stream/route.ts index 84c8c05b0f..7e72fe01f5 100644 --- a/apps/sim/app/api/proxy/tts/stream/route.ts +++ b/apps/sim/app/api/proxy/tts/stream/route.ts @@ -1,21 +1,90 @@ +import { db } from '@sim/db' +import { chat } from '@sim/db/schema' +import { eq } from 'drizzle-orm' import type { NextRequest } from 'next/server' import { checkHybridAuth } from '@/lib/auth/hybrid' import { env } from '@/lib/core/config/env' import { validateAlphanumericId } from '@/lib/core/security/input-validation' import { createLogger } from '@/lib/logs/console/logger' +import { validateAuthToken } from '@/app/api/chat/utils' const logger = createLogger('ProxyTTSStreamAPI') +/** + * Validates chat-based authentication for deployed chat voice mode + * Checks if the user has a valid chat auth cookie for the given chatId + */ +async function validateChatAuth(request: NextRequest, chatId: string): Promise { + try { + // Verify the chat exists and is active + const chatResult = await db + .select({ id: chat.id, isActive: chat.isActive }) + .from(chat) + .where(eq(chat.id, chatId)) + .limit(1) + + if (chatResult.length === 0 || !chatResult[0].isActive) { + logger.warn('Chat not found or inactive for TTS auth:', chatId) + return false + } + + // Check for chat auth cookie + const cookieName = `chat_auth_${chatId}` + const authCookie = request.cookies.get(cookieName) + + if (authCookie && validateAuthToken(authCookie.value, chatId)) { + return true + } + + // For public chats, allow TTS without cookie (the chat itself handles access control) + // We already verified the chat exists and is active above + const [chatData] = await db + .select({ authType: chat.authType }) + .from(chat) + .where(eq(chat.id, chatId)) + .limit(1) + + if (chatData?.authType === 'public') { + return true + } + + return false + } catch (error) { + logger.error('Error validating chat auth for TTS:', error) + return false + } +} + export async function POST(request: NextRequest) { try { - const authResult = await checkHybridAuth(request, { requireWorkflowId: false }) - if (!authResult.success) { - logger.error('Authentication failed for TTS stream proxy:', authResult.error) - return new Response('Unauthorized', { status: 401 }) + // Clone request to read body for chatId check before hybrid auth consumes it + const clonedRequest = request.clone() + let body: any + try { + body = await clonedRequest.json() + } catch { + return new Response('Invalid request body', { status: 400 }) } - const body = await request.json() - const { text, voiceId, modelId = 'eleven_turbo_v2_5' } = body + const { text, voiceId, modelId = 'eleven_turbo_v2_5', chatId } = body + + // Try chat-based authentication first if chatId is provided + if (chatId) { + const isChatAuthed = await validateChatAuth(request, chatId) + if (isChatAuthed) { + logger.info('TTS request authenticated via chat auth for chatId:', chatId) + } else { + logger.warn('Chat authentication failed for TTS, chatId:', chatId) + return new Response('Unauthorized', { status: 401 }) + } + } else { + // Fall back to standard hybrid auth (session, API key, or internal JWT) + const authResult = await checkHybridAuth(request, { requireWorkflowId: false }) + if (!authResult.success) { + logger.error('Authentication failed for TTS stream proxy:', authResult.error) + return new Response('Unauthorized', { status: 401 }) + } + } if (!text || !voiceId) { return new Response('Missing required parameters', { status: 400 }) diff --git a/apps/sim/app/chat/[identifier]/chat.tsx b/apps/sim/app/chat/[identifier]/chat.tsx index fe63fbf18f..40f9f1b8a0 100644 --- a/apps/sim/app/chat/[identifier]/chat.tsx +++ b/apps/sim/app/chat/[identifier]/chat.tsx @@ -39,6 +39,7 @@ interface ChatConfig { interface AudioStreamingOptions { voiceId: string + chatId?: string onError: (error: Error) => void } @@ -62,16 +63,19 @@ function fileToBase64(file: File): Promise { * Creates an audio stream handler for text-to-speech conversion * @param streamTextToAudio - Function to stream text to audio * @param voiceId - The voice ID to use for TTS + * @param chatId - Optional chat ID for deployed chat authentication * @returns Audio stream handler function or undefined */ function createAudioStreamHandler( streamTextToAudio: (text: string, options: AudioStreamingOptions) => Promise, - voiceId: string + voiceId: string, + chatId?: string ) { return async (text: string) => { try { await streamTextToAudio(text, { voiceId, + chatId, onError: (error: Error) => { logger.error('Audio streaming error:', error) }, @@ -391,7 +395,11 @@ export default function ChatClient({ identifier }: { identifier: string }) { // Use the streaming hook with audio support const shouldPlayAudio = isVoiceInput || isVoiceFirstMode const audioHandler = shouldPlayAudio - ? createAudioStreamHandler(streamTextToAudio, DEFAULT_VOICE_SETTINGS.voiceId) + ? createAudioStreamHandler( + streamTextToAudio, + DEFAULT_VOICE_SETTINGS.voiceId, + chatConfig?.id + ) : undefined logger.info('Starting to handle streamed response:', { shouldPlayAudio }) diff --git a/apps/sim/app/chat/hooks/use-audio-streaming.ts b/apps/sim/app/chat/hooks/use-audio-streaming.ts index 0cd8ebdb42..b37e7ac83c 100644 --- a/apps/sim/app/chat/hooks/use-audio-streaming.ts +++ b/apps/sim/app/chat/hooks/use-audio-streaming.ts @@ -14,6 +14,7 @@ declare global { interface AudioStreamingOptions { voiceId: string modelId?: string + chatId?: string onAudioStart?: () => void onAudioEnd?: () => void onError?: (error: Error) => void @@ -76,7 +77,14 @@ export function useAudioStreaming(sharedAudioContextRef?: RefObject Date: Sat, 13 Dec 2025 11:34:56 -0800 Subject: [PATCH 2/4] remove redundant check --- apps/sim/app/api/proxy/tts/stream/route.ts | 29 +++++++--------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/apps/sim/app/api/proxy/tts/stream/route.ts b/apps/sim/app/api/proxy/tts/stream/route.ts index 7e72fe01f5..b2a549a131 100644 --- a/apps/sim/app/api/proxy/tts/stream/route.ts +++ b/apps/sim/app/api/proxy/tts/stream/route.ts @@ -2,7 +2,6 @@ import { db } from '@sim/db' import { chat } from '@sim/db/schema' import { eq } from 'drizzle-orm' import type { NextRequest } from 'next/server' -import { checkHybridAuth } from '@/lib/auth/hybrid' import { env } from '@/lib/core/config/env' import { validateAlphanumericId } from '@/lib/core/security/input-validation' import { createLogger } from '@/lib/logs/console/logger' @@ -57,39 +56,29 @@ async function validateChatAuth(request: NextRequest, chatId: string): Promise Date: Sat, 13 Dec 2025 11:49:03 -0800 Subject: [PATCH 3/4] consolidate query --- apps/sim/app/api/proxy/tts/stream/route.ts | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/apps/sim/app/api/proxy/tts/stream/route.ts b/apps/sim/app/api/proxy/tts/stream/route.ts index b2a549a131..cc9f8f1455 100644 --- a/apps/sim/app/api/proxy/tts/stream/route.ts +++ b/apps/sim/app/api/proxy/tts/stream/route.ts @@ -15,9 +15,8 @@ const logger = createLogger('ProxyTTSStreamAPI') */ async function validateChatAuth(request: NextRequest, chatId: string): Promise { try { - // Verify the chat exists and is active const chatResult = await db - .select({ id: chat.id, isActive: chat.isActive }) + .select({ id: chat.id, isActive: chat.isActive, authType: chat.authType }) .from(chat) .where(eq(chat.id, chatId)) .limit(1) @@ -27,23 +26,18 @@ async function validateChatAuth(request: NextRequest, chatId: string): Promise Date: Sat, 13 Dec 2025 12:25:23 -0800 Subject: [PATCH 4/4] invalidate session on password change + race condition fix --- apps/sim/app/(landing)/actions/github.ts | 2 +- apps/sim/app/api/chat/[identifier]/route.ts | 4 +- apps/sim/app/api/chat/utils.ts | 41 +++++++--- apps/sim/app/api/proxy/tts/stream/route.ts | 11 ++- apps/sim/app/api/stars/route.ts | 6 +- apps/sim/app/chat/[identifier]/chat.tsx | 2 +- .../voice-interface/voice-interface.tsx | 80 ++++--------------- 7 files changed, 61 insertions(+), 85 deletions(-) diff --git a/apps/sim/app/(landing)/actions/github.ts b/apps/sim/app/(landing)/actions/github.ts index 527f29ea44..42f586a956 100644 --- a/apps/sim/app/(landing)/actions/github.ts +++ b/apps/sim/app/(landing)/actions/github.ts @@ -1,6 +1,6 @@ import { createLogger } from '@/lib/logs/console/logger' -const DEFAULT_STARS = '18.6k' +const DEFAULT_STARS = '19.4k' const logger = createLogger('GitHubStars') diff --git a/apps/sim/app/api/chat/[identifier]/route.ts b/apps/sim/app/api/chat/[identifier]/route.ts index eefb9ca997..44e9e524a9 100644 --- a/apps/sim/app/api/chat/[identifier]/route.ts +++ b/apps/sim/app/api/chat/[identifier]/route.ts @@ -132,7 +132,7 @@ export async function POST( if ((password || email) && !input) { const response = addCorsHeaders(createSuccessResponse({ authenticated: true }), request) - setChatAuthCookie(response, deployment.id, deployment.authType) + setChatAuthCookie(response, deployment.id, deployment.authType, deployment.password) return response } @@ -315,7 +315,7 @@ export async function GET( if ( deployment.authType !== 'public' && authCookie && - validateAuthToken(authCookie.value, deployment.id) + validateAuthToken(authCookie.value, deployment.id, deployment.password) ) { return addCorsHeaders( createSuccessResponse({ diff --git a/apps/sim/app/api/chat/utils.ts b/apps/sim/app/api/chat/utils.ts index 1e41f92012..c8b76d92fc 100644 --- a/apps/sim/app/api/chat/utils.ts +++ b/apps/sim/app/api/chat/utils.ts @@ -1,3 +1,4 @@ +import { createHash } from 'crypto' import { db } from '@sim/db' import { chat, workflow } from '@sim/db/schema' import { eq } from 'drizzle-orm' @@ -9,6 +10,10 @@ import { hasAdminPermission } from '@/lib/workspaces/permissions/utils' const logger = createLogger('ChatAuthUtils') +function hashPassword(encryptedPassword: string): string { + return createHash('sha256').update(encryptedPassword).digest('hex').substring(0, 8) +} + /** * Check if user has permission to create a chat for a specific workflow * Either the user owns the workflow directly OR has admin permission for the workflow's workspace @@ -77,14 +82,20 @@ export async function checkChatAccess( return { hasAccess: false } } -const encryptAuthToken = (chatId: string, type: string): string => { - return Buffer.from(`${chatId}:${type}:${Date.now()}`).toString('base64') +function encryptAuthToken(chatId: string, type: string, encryptedPassword?: string | null): string { + const pwHash = encryptedPassword ? hashPassword(encryptedPassword) : '' + return Buffer.from(`${chatId}:${type}:${Date.now()}:${pwHash}`).toString('base64') } -export const validateAuthToken = (token: string, chatId: string): boolean => { +export function validateAuthToken( + token: string, + chatId: string, + encryptedPassword?: string | null +): boolean { try { const decoded = Buffer.from(token, 'base64').toString() - const [storedId, _type, timestamp] = decoded.split(':') + const parts = decoded.split(':') + const [storedId, _type, timestamp, storedPwHash] = parts if (storedId !== chatId) { return false @@ -92,20 +103,32 @@ export const validateAuthToken = (token: string, chatId: string): boolean => { const createdAt = Number.parseInt(timestamp) const now = Date.now() - const expireTime = 24 * 60 * 60 * 1000 // 24 hours + const expireTime = 24 * 60 * 60 * 1000 if (now - createdAt > expireTime) { return false } + if (encryptedPassword) { + const currentPwHash = hashPassword(encryptedPassword) + if (storedPwHash !== currentPwHash) { + return false + } + } + return true } catch (_e) { return false } } -export const setChatAuthCookie = (response: NextResponse, chatId: string, type: string): void => { - const token = encryptAuthToken(chatId, type) +export function setChatAuthCookie( + response: NextResponse, + chatId: string, + type: string, + encryptedPassword?: string | null +): void { + const token = encryptAuthToken(chatId, type, encryptedPassword) response.cookies.set({ name: `chat_auth_${chatId}`, value: token, @@ -113,7 +136,7 @@ export const setChatAuthCookie = (response: NextResponse, chatId: string, type: secure: !isDev, sameSite: 'lax', path: '/', - maxAge: 60 * 60 * 24, // 24 hours + maxAge: 60 * 60 * 24, }) } @@ -145,7 +168,7 @@ export async function validateChatAuth( const cookieName = `chat_auth_${deployment.id}` const authCookie = request.cookies.get(cookieName) - if (authCookie && validateAuthToken(authCookie.value, deployment.id)) { + if (authCookie && validateAuthToken(authCookie.value, deployment.id, deployment.password)) { return { authorized: true } } diff --git a/apps/sim/app/api/proxy/tts/stream/route.ts b/apps/sim/app/api/proxy/tts/stream/route.ts index cc9f8f1455..316c0d0a0a 100644 --- a/apps/sim/app/api/proxy/tts/stream/route.ts +++ b/apps/sim/app/api/proxy/tts/stream/route.ts @@ -16,7 +16,12 @@ const logger = createLogger('ProxyTTSStreamAPI') async function validateChatAuth(request: NextRequest, chatId: string): Promise { try { const chatResult = await db - .select({ id: chat.id, isActive: chat.isActive, authType: chat.authType }) + .select({ + id: chat.id, + isActive: chat.isActive, + authType: chat.authType, + password: chat.password, + }) .from(chat) .where(eq(chat.id, chatId)) .limit(1) @@ -28,16 +33,14 @@ async function validateChatAuth(request: NextRequest, chatId: string): Promise(null) const messagesEndRef = useRef(null) const messagesContainerRef = useRef(null) - const [starCount, setStarCount] = useState('3.4k') + const [starCount, setStarCount] = useState('19.4k') const [conversationId, setConversationId] = useState('') const [showScrollButton, setShowScrollButton] = useState(false) diff --git a/apps/sim/app/chat/components/voice-interface/voice-interface.tsx b/apps/sim/app/chat/components/voice-interface/voice-interface.tsx index a4f2ad095e..d4dc002ff2 100644 --- a/apps/sim/app/chat/components/voice-interface/voice-interface.tsx +++ b/apps/sim/app/chat/components/voice-interface/voice-interface.tsx @@ -68,7 +68,6 @@ export function VoiceInterface({ messages = [], className, }: VoiceInterfaceProps) { - // Simple state machine const [state, setState] = useState<'idle' | 'listening' | 'agent_speaking'>('idle') const [isInitialized, setIsInitialized] = useState(false) const [isMuted, setIsMuted] = useState(false) @@ -76,12 +75,10 @@ export function VoiceInterface({ const [permissionStatus, setPermissionStatus] = useState<'prompt' | 'granted' | 'denied'>( 'prompt' ) - - // Current turn transcript (subtitle) const [currentTranscript, setCurrentTranscript] = useState('') - // State tracking const currentStateRef = useRef<'idle' | 'listening' | 'agent_speaking'>('idle') + const isCallEndedRef = useRef(false) useEffect(() => { currentStateRef.current = state @@ -98,12 +95,10 @@ export function VoiceInterface({ const isSupported = typeof window !== 'undefined' && !!(window.SpeechRecognition || window.webkitSpeechRecognition) - // Update muted ref useEffect(() => { isMutedRef.current = isMuted }, [isMuted]) - // Timeout to handle cases where agent doesn't provide audio response const setResponseTimeout = useCallback(() => { if (responseTimeoutRef.current) { clearTimeout(responseTimeoutRef.current) @@ -113,7 +108,7 @@ export function VoiceInterface({ if (currentStateRef.current === 'listening') { setState('idle') } - }, 5000) // 5 second timeout (increased from 3) + }, 5000) }, []) const clearResponseTimeout = useCallback(() => { @@ -123,14 +118,12 @@ export function VoiceInterface({ } }, []) - // Sync with external state useEffect(() => { if (isPlayingAudio && state !== 'agent_speaking') { - clearResponseTimeout() // Clear timeout since agent is responding + clearResponseTimeout() setState('agent_speaking') setCurrentTranscript('') - // Mute microphone immediately setIsMuted(true) if (mediaStreamRef.current) { mediaStreamRef.current.getAudioTracks().forEach((track) => { @@ -138,7 +131,6 @@ export function VoiceInterface({ }) } - // Stop speech recognition completely if (recognitionRef.current) { try { recognitionRef.current.abort() @@ -150,7 +142,6 @@ export function VoiceInterface({ setState('idle') setCurrentTranscript('') - // Re-enable microphone setIsMuted(false) if (mediaStreamRef.current) { mediaStreamRef.current.getAudioTracks().forEach((track) => { @@ -160,7 +151,6 @@ export function VoiceInterface({ } }, [isPlayingAudio, state, clearResponseTimeout]) - // Audio setup const setupAudio = useCallback(async () => { try { const stream = await navigator.mediaDevices.getUserMedia({ @@ -175,7 +165,6 @@ export function VoiceInterface({ setPermissionStatus('granted') mediaStreamRef.current = stream - // Setup audio context for visualization if (!audioContextRef.current) { const AudioContext = window.AudioContext || window.webkitAudioContext audioContextRef.current = new AudioContext() @@ -194,7 +183,6 @@ export function VoiceInterface({ source.connect(analyser) analyserRef.current = analyser - // Start visualization const updateVisualization = () => { if (!analyserRef.current) return @@ -223,7 +211,6 @@ export function VoiceInterface({ } }, []) - // Speech recognition setup const setupSpeechRecognition = useCallback(() => { if (!isSupported) return @@ -259,14 +246,11 @@ export function VoiceInterface({ } } - // Update live transcript setCurrentTranscript(interimTranscript || finalTranscript) - // Send final transcript (but keep listening state until agent responds) if (finalTranscript.trim()) { - setCurrentTranscript('') // Clear transcript + setCurrentTranscript('') - // Stop recognition to avoid interference while waiting for response if (recognitionRef.current) { try { recognitionRef.current.stop() @@ -275,7 +259,6 @@ export function VoiceInterface({ } } - // Start timeout in case agent doesn't provide audio response setResponseTimeout() onVoiceTranscript?.(finalTranscript) @@ -283,13 +266,14 @@ export function VoiceInterface({ } recognition.onend = () => { + if (isCallEndedRef.current) return + const currentState = currentStateRef.current - // Only restart recognition if we're in listening state and not muted if (currentState === 'listening' && !isMutedRef.current) { - // Add a delay to avoid immediate restart after sending transcript setTimeout(() => { - // Double-check state hasn't changed during delay + if (isCallEndedRef.current) return + if ( recognitionRef.current && currentStateRef.current === 'listening' && @@ -301,14 +285,12 @@ export function VoiceInterface({ logger.debug('Error restarting speech recognition:', error) } } - }, 1000) // Longer delay to give agent time to respond + }, 1000) } } recognition.onerror = (event: SpeechRecognitionErrorEvent) => { - // Filter out "aborted" errors - these are expected when we intentionally stop recognition if (event.error === 'aborted') { - // Ignore return } @@ -320,7 +302,6 @@ export function VoiceInterface({ recognitionRef.current = recognition }, [isSupported, onVoiceTranscript, setResponseTimeout]) - // Start/stop listening const startListening = useCallback(() => { if (!isInitialized || isMuted || state !== 'idle') { return @@ -351,17 +332,12 @@ export function VoiceInterface({ } }, []) - // Handle interrupt const handleInterrupt = useCallback(() => { if (state === 'agent_speaking') { - // Clear any subtitle timeouts and text - // (No longer needed after removing subtitle system) - onInterrupt?.() setState('listening') setCurrentTranscript('') - // Unmute microphone for user input setIsMuted(false) if (mediaStreamRef.current) { mediaStreamRef.current.getAudioTracks().forEach((track) => { @@ -369,7 +345,6 @@ export function VoiceInterface({ }) } - // Start listening immediately if (recognitionRef.current) { try { recognitionRef.current.start() @@ -380,14 +355,13 @@ export function VoiceInterface({ } }, [state, onInterrupt]) - // Handle call end with proper cleanup const handleCallEnd = useCallback(() => { - // Stop everything immediately + isCallEndedRef.current = true + setState('idle') setCurrentTranscript('') setIsMuted(false) - // Stop speech recognition if (recognitionRef.current) { try { recognitionRef.current.abort() @@ -396,17 +370,11 @@ export function VoiceInterface({ } } - // Clear timeouts clearResponseTimeout() - - // Stop audio playback and streaming immediately onInterrupt?.() - - // Call the original onCallEnd onCallEnd?.() }, [onCallEnd, onInterrupt, clearResponseTimeout]) - // Keyboard handler useEffect(() => { const handleKeyDown = (event: KeyboardEvent) => { if (event.code === 'Space') { @@ -419,7 +387,6 @@ export function VoiceInterface({ return () => document.removeEventListener('keydown', handleKeyDown) }, [handleInterrupt]) - // Mute toggle const toggleMute = useCallback(() => { if (state === 'agent_speaking') { handleInterrupt() @@ -442,7 +409,6 @@ export function VoiceInterface({ } }, [isMuted, state, handleInterrupt, stopListening, startListening]) - // Initialize useEffect(() => { if (isSupported) { setupSpeechRecognition() @@ -450,47 +416,40 @@ export function VoiceInterface({ } }, [isSupported, setupSpeechRecognition, setupAudio]) - // Auto-start listening when ready useEffect(() => { if (isInitialized && !isMuted && state === 'idle') { startListening() } }, [isInitialized, isMuted, state, startListening]) - // Cleanup when call ends or component unmounts useEffect(() => { return () => { - // Stop speech recognition + isCallEndedRef.current = true + if (recognitionRef.current) { try { recognitionRef.current.abort() - } catch (error) { + } catch (_e) { // Ignore } recognitionRef.current = null } - // Stop media stream if (mediaStreamRef.current) { - mediaStreamRef.current.getTracks().forEach((track) => { - track.stop() - }) + mediaStreamRef.current.getTracks().forEach((track) => track.stop()) mediaStreamRef.current = null } - // Stop audio context if (audioContextRef.current) { audioContextRef.current.close() audioContextRef.current = null } - // Cancel animation frame if (animationFrameRef.current) { cancelAnimationFrame(animationFrameRef.current) animationFrameRef.current = null } - // Clear timeouts if (responseTimeoutRef.current) { clearTimeout(responseTimeoutRef.current) responseTimeoutRef.current = null @@ -498,7 +457,6 @@ export function VoiceInterface({ } }, []) - // Get status text const getStatusText = () => { switch (state) { case 'listening': @@ -510,7 +468,6 @@ export function VoiceInterface({ } } - // Get button content const getButtonContent = () => { if (state === 'agent_speaking') { return ( @@ -524,9 +481,7 @@ export function VoiceInterface({ return (
- {/* Main content */}
- {/* Voice visualization */}
- {/* Live transcript - subtitle style */}
{currentTranscript && (
@@ -549,17 +503,14 @@ export function VoiceInterface({ )}
- {/* Status */}

{getStatusText()} {isMuted && (Muted)}

- {/* Controls */}
- {/* End call */} - {/* Mic/Stop button */}