diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx
index 79a713c93..0bf4c9b30 100644
--- a/apps/llm/app/voice_chat/index.tsx
+++ b/apps/llm/app/voice_chat/index.tsx
@@ -76,7 +76,11 @@ function VoiceChatScreen() {
       });
       recorder.start();
       const transcription = await speechToText.stream();
-      await llm.sendMessage(transcription);
+      await llm.sendMessage(
+        typeof transcription === 'string'
+          ? transcription
+          : transcription.map((w) => w.word).join(' ')
+      );
     }
   };

@@ -105,7 +109,13 @@ function VoiceChatScreen() {
             ...llm.messageHistory,
             {
               role: 'user',
-              content: speechToText.committedTranscription,
+              content:
+                typeof speechToText.committedTranscription ===
+                'string'
+                  ? speechToText.committedTranscription
+                  : speechToText.committedTranscription
+                      .map((w) => w.word)
+                      .join(' '),
             },
           ]
         : llm.messageHistory
diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index 1e4525986..98fff26a4 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -8,9 +8,14 @@ import {
   TextInput,
   KeyboardAvoidingView,
   Platform,
+  Switch,
 } from 'react-native';
 import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
-import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
+import {
+  useSpeechToText,
+  WHISPER_TINY_EN,
+  Word,
+} from 'react-native-executorch';
 import FontAwesome from '@expo/vector-icons/FontAwesome';
 import {
   AudioManager,
@@ -28,7 +33,10 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
     model: WHISPER_TINY_EN,
   });

-  const [transcription, setTranscription] = useState('');
+  const [transcription, setTranscription] = useState<string | Word[]>('');
+
+  const [enableTimestamps, setEnableTimestamps] = useState(false);
+
   const [audioURL, setAudioURL] = useState('');
   const [liveTranscribing, setLiveTranscribing] = useState(false);
   const scrollViewRef = useRef(null);
@@ -50,6 +58,15 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
     AudioManager.requestRecordingPermissions();
   }, []);

+  const getText = (data: string | Word[] | undefined) => {
+    if (!data) return '';
+    if (typeof data === 'string') return data;
+
+    return data
+      .map((w) => `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`)
+      .join('');
+  };
+
   const handleTranscribeFromURL = async () => {
     if (!audioURL.trim()) {
       console.warn('Please provide a valid audio file URL');
@@ -66,7 +83,11 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
     try {
       const decodedAudioData = await audioContext.decodeAudioDataSource(uri);
       const audioBuffer = decodedAudioData.getChannelData(0);
-      setTranscription(await model.transcribe(audioBuffer));
+
+      const result = await model.transcribe(audioBuffer, {
+        enableTimestamps: enableTimestamps as any,
+      });
+      setTranscription(result);
     } catch (error) {
       console.error('Error decoding audio data', error);
       console.warn('Note: Supported file formats: mp3, wav, flac');
@@ -76,14 +97,15 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {

   const handleStartTranscribeFromMicrophone = async () => {
     setLiveTranscribing(true);
-    setTranscription('');
+    setTranscription(enableTimestamps ? [] : '');
+
     recorder.onAudioReady(({ buffer }) => {
       model.streamInsert(buffer.getChannelData(0));
     });
     recorder.start();
     try {
-      await model.stream();
+      await model.stream({ enableTimestamps: enableTimestamps });
     } catch (error) {
       console.error('Error during live transcription:', error);
     }
@@ -106,6 +128,13 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
   const readyToTranscribe = !model.isGenerating && model.isReady;
   const recordingButtonDisabled = isSimulator || !readyToTranscribe;

+  const hasResult = transcription.length > 0;
+
+  const displayedText = hasResult
+    ? getText(transcription)
+    : getText(model.committedTranscription) +
+      getText(model.nonCommittedTranscription);
+
   return (
@@ -126,6 +155,20 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
           Status: {getModelStatus()}
+          <View style={styles.toggleContainer}>
+            <Text style={styles.toggleLabel}>Enable Timestamps</Text>
+            <Switch
+              value={enableTimestamps}
+              onValueChange={(val) => {
+                setEnableTimestamps(val);
+                setTranscription(val ? [] : '');
+              }}
+              trackColor={{ false: '#767577', true: '#0f186e' }}
+              thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'}
+              disabled={model.isGenerating}
+            />
+          </View>
           Transcription
@@ … @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
             scrollViewRef.current?.scrollToEnd({ animated: true })
           }
         >
-          {transcription !== ''
-            ? transcription
-            : model.committedTranscription +
-              model.nonCommittedTranscription}
+          {displayedText}
@@ -229,6 +267,17 @@ const styles = StyleSheet.create({
     marginTop: 12,
     alignItems: 'center',
   },
+  toggleContainer: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    marginTop: 10,
+    marginBottom: 5,
+  },
+  toggleLabel: {
+    fontSize: 16,
+    marginRight: 10,
+    color: '#0f186e',
+  },
   transcriptionContainer: {
     flex: 1,
     width: '100%',
diff --git a/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md
index d94c96a66..b3171c77f 100644
--- a/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md
+++ b/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md
@@ -75,25 +75,31 @@ For more information on loading resources, take a look at [loading models](../..

 ### Returns

-| Field | Type | Description |
-| ----- | ---- | ----------- |
-| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. |
-| `stream` | `(options?: DecodingOptions \| undefined) => Promise<void>` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. |
-| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. |
-| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. |
-| `encode` | `(waveform: Float32Array \| number[]) => Promise<Float32Array>` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. |
-| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise<Float32Array>` | Runs the decoder of the model. Passing `number[]` is deprecated. |
-| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. |
-| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. |
-| `error` | `string \| null` | Contains the error message if the model failed to load. |
-| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. |
-| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. |
-| `downloadProgress` | `number` | Tracks the progress of the model download process. |
+| Field | Type | Description |
+| ----- | ---- | ----------- |
+| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions & { enableTimestamps?: boolean }) => Promise<string \| Word[]>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. For multilingual models, specify the language in `options`, e.g. `{ language: 'es' }`. If `enableTimestamps` is true, the transcription is returned with word-level timestamps (`Word[]`); if false (default), it is returned as a string. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. |
+| `stream` | `(options?: DecodingOptions & { enableTimestamps?: boolean }) => Promise<void>` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. As in `transcribe`, set `enableTimestamps` to choose whether the transcription includes word-level timestamps. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. |
+| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. |
+| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. |
+| `encode` | `(waveform: Float32Array \| number[]) => Promise<Float32Array>` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. |
+| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise<Float32Array>` | Runs the decoder of the model. Passing `number[]` is deprecated. |
+| `committedTranscription` | `string \| Word[]` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. |
+| `nonCommittedTranscription` | `string \| Word[]` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. |
+| `error` | `string \| null` | Contains the error message if the model failed to load. |
+| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. |
+| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. |
+| `downloadProgress` | `number` | Tracks the progress of the model download process. |

 <details>
 <summary>Type definitions</summary>

 ```typescript
+interface Word {
+  word: string;
+  start: number;
+  end: number;
+}
+
 // Languages supported by whisper (Multilingual)
 type SpeechToTextLanguage =
   | 'af'
@@ -174,6 +180,7 @@ type SpeechToTextLanguage =

 interface DecodingOptions {
   language?: SpeechToTextLanguage;
+  enableTimestamps?: boolean;
 }

 interface SpeechToTextModelConfig {
@@ -204,12 +211,25 @@ const model = useSpeechToText({
 const transcription = await model.transcribe(spanishAudio, { language: 'es' });
 ```

+### Timestamps
+
+You can obtain word-level timestamps by setting `enableTimestamps: true` in the options. This changes the return type from a string to an array of `Word` objects.
+
+```typescript
+const words = await model.transcribe(audioBuffer, { enableTimestamps: true });
+// words: [{ word: "Hello", start: 0.0, end: 0.4 }, ...]
+```
+
 ## Example

 ```tsx
 import React, { useState } from 'react';
-import { Button, Text } from 'react-native';
-import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
+import { Button, Text, View } from 'react-native';
+import {
+  useSpeechToText,
+  WHISPER_TINY_EN,
+  Word,
+} from 'react-native-executorch';
 import { AudioContext } from 'react-native-audio-api';
 import * as FileSystem from 'expo-file-system';

@@ -218,7 +238,7 @@ function App() {
     model: WHISPER_TINY_EN,
   });

-  const [transcription, setTranscription] = useState('');
+  const [transcription, setTranscription] = useState<string | Word[]>('');

   const loadAudio = async () => {
     const { uri } = await FileSystem.downloadAsync(
@@ -235,14 +255,38 @@ const handleTranscribe = async () => {
     const audio = await loadAudio();
-    await model.transcribe(audio);
+    // Default text transcription
+    const result = await model.transcribe(audio);
+    setTranscription(result);
+  };
+
+  const handleTranscribeWithTimestamps = async () => {
+    const audio = await loadAudio();
+    // Transcription with timestamps
+    const result = await model.transcribe(audio, { enableTimestamps: true });
+    setTranscription(result);
+  };
+
+  const renderContent = () => {
+    if (typeof transcription === 'string') {
+      return <Text>{transcription}</Text>;
+    }
+    return transcription.map((w, i) => (
+      <Text key={i}>
+        {w.word} ({w.start.toFixed(2)}s)
+      </Text>
+    ));
+  };

   return (
-    <>
-      <Text>{transcription}</Text>
-
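For reference, a minimal sketch of consuming the timestamped output downstream, assuming only the `Word` shape documented above (`word`, plus `start`/`end` in seconds); the helper names and the grouping threshold are illustrative, not part of the library API:

```typescript
// Shape of a single timestamped word, mirroring the `Word` type documented above.
interface Word {
  word: string;
  start: number;
  end: number;
}

// Join words back into caption-style lines, starting a new line whenever the
// pause between consecutive words exceeds `maxGapSeconds` (an arbitrary choice).
function toCaptionLines(words: Word[], maxGapSeconds = 0.8): string[] {
  const lines: string[] = [];
  let current: Word[] = [];

  for (const w of words) {
    const prev = current[current.length - 1];
    if (prev && w.start - prev.end > maxGapSeconds) {
      lines.push(formatLine(current));
      current = [];
    }
    current.push(w);
  }
  if (current.length > 0) {
    lines.push(formatLine(current));
  }
  return lines;
}

// Render one group of words as "[start - end] text".
function formatLine(words: Word[]): string {
  const start = words[0].start.toFixed(2);
  const end = words[words.length - 1].end.toFixed(2);
  const text = words
    .map((w) => w.word)
    .join(' ')
    .trim();
  return `[${start}s - ${end}s] ${text}`;
}
```

Grouping on pauses keeps the formatting logic independent of how the words were produced, so the same helper works for a `transcribe` result and for committed streaming output alike.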