From 0ed673071ceeb51ee8994c78b049e43c2aecbe96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Tue, 20 Jan 2026 14:16:54 +0100 Subject: [PATCH 01/14] Draft of changes introducing timestamping --- .../host_objects/JsiConversions.h | 20 ++ .../models/speech_to_text/SpeechToText.cpp | 127 ++++++++-- .../models/speech_to_text/SpeechToText.h | 3 +- .../stream/OnlineASRProcessor.cpp | 32 +-- .../stream/OnlineASRProcessor.h | 5 +- .../useSpeechToText.ts | 177 +++++++++++++- .../SpeechToTextModule.ts | 228 ++++++++++++++++-- 7 files changed, 535 insertions(+), 57 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 97e8d91fb..ec6332209 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -62,6 +62,20 @@ getValue>(const jsi::Value &val, val.asObject(runtime).asFunction(runtime)); } +template <> +inline getValue(const jsi::Value &val, jsi::Runtime &runtime) { + jsi::Array jsiArr(rt, words.size()); + for (size_t i = 0; i < words.size(); ++i) { + jsi::Object obj(rt); + obj.setProperty(rt, "word", + jsi::String::createFromUtf8(rt, words[i].content)); + obj.setProperty(rt, "start", static_cast(words[i].start)); + obj.setProperty(rt, "end", static_cast(words[i].end)); + jsiArr.setValueAtIndex(rt, i, obj); + } + return jsiArr; +}; + template <> inline JSTensorViewIn getValue(const jsi::Value &val, jsi::Runtime &runtime) { @@ -218,6 +232,12 @@ getValue>(const jsi::Value &val, jsi::Runtime &runtime) { return getArrayAsVector(val, runtime); } +template <> +inline std::vector getValue>(const jsi::Value &val, + jsi::Runtime &runtime) { + return getArrayAsVector(val, runtime); +} + // Template specializations for std::span types template <> inline std::span getValue>(const jsi::Value &val, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index 3c81eb8e9..7f8ef81cf 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -66,11 +66,92 @@ std::vector SpeechToText::transcribe(std::span waveform, return {transcription.begin(), transcription.end()}; } +std::vector SpeechToText::transcribe(std::span waveform, + std::string languageOption) const { + std::vector segments = + this->asr->transcribe(waveform, DecodingOptions(languageOption)); + std::vector transcription; + + size_t transcriptionLength = 0; + for (auto &segment : segments) { + transcriptionLength += segment.words.size(); + } + + transcription.reserve(segments.size()); + + for (auto &segment : segments) { + for (auto &word : segment.words) { + transcription.push_back(word); + } + } + + auto wordsToJsi = [](jsi::Runtime &rt, + const std::vector &words) -> jsi::Value { + jsi::Array jsiArr(rt, words.size()); + for (size_t i = 0; i < words.size(); ++i) { + jsi::Object obj(rt); + obj.setProperty(rt, "word", + jsi::String::createFromUtf8(rt, words[i].content)); + obj.setProperty(rt, "start", static_cast(words[i].start)); + obj.setProperty(rt, "end", static_cast(words[i].end)); + jsiArr.setValueAtIndex(rt, i, obj); + } + return jsiArr; + }; + + return transcription; +} 
+ size_t SpeechToText::getMemoryLowerBound() const noexcept { return this->encoder->getMemoryLowerBound() + this->decoder->getMemoryLowerBound(); } +// void SpeechToText::stream(std::shared_ptr callback, +// std::string languageOption) { +// if (this->isStreaming) { +// throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress, +// "Streaming is already in progress!"); +// } + +// auto nativeCallback = +// [this, callback](const std::vector &committedVec, +// const std::vector &nonCommittedVec, bool isDone) +// { +// this->callInvoker->invokeAsync([callback, committedVec, +// nonCommittedVec, +// isDone](jsi::Runtime &rt) { +// callback->call( +// rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, +// rt), rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, +// rt), jsi::Value(isDone)); +// }); +// }; + +// this->isStreaming = true; +// while (this->isStreaming) { +// if (!this->readyToProcess || +// this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) +// { +// std::this_thread::sleep_for(std::chrono::milliseconds(100)); +// continue; +// } +// ProcessResult res = +// this->processor->processIter(DecodingOptions(languageOption)); + +// nativeCallback({res.committed.begin(), res.committed.end()}, +// {res.nonCommitted.begin(), res.nonCommitted.end()}, +// false); +// this->readyToProcess = false; +// } + +// std::string committed = this->processor->finish(); + +// nativeCallback({committed.begin(), committed.end()}, {}, true); + +// this->resetStreamState(); +// } + void SpeechToText::stream(std::shared_ptr callback, std::string languageOption) { if (this->isStreaming) { @@ -78,17 +159,33 @@ void SpeechToText::stream(std::shared_ptr callback, "Streaming is already in progress!"); } - auto nativeCallback = - [this, callback](const std::vector &committedVec, - const std::vector &nonCommittedVec, bool isDone) { - this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec, - isDone](jsi::Runtime &rt) { - callback->call( - rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt), - rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt), - jsi::Value(isDone)); - }); - }; + auto wordsToJsi = [](jsi::Runtime &rt, + const std::vector &words) -> jsi::Value { + jsi::Array jsiArr(rt, words.size()); + for (size_t i = 0; i < words.size(); ++i) { + jsi::Object obj(rt); + obj.setProperty(rt, "word", + jsi::String::createFromUtf8(rt, words[i].content)); + obj.setProperty(rt, "start", static_cast(words[i].start)); + obj.setProperty(rt, "end", static_cast(words[i].end)); + jsiArr.setValueAtIndex(rt, i, obj); + } + return jsiArr; + }; + + auto nativeCallback = [this, callback, + wordsToJsi](const std::vector &committedVec, + const std::vector &nonCommittedVec, + bool isDone) { + this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec, + isDone, wordsToJsi](jsi::Runtime &rt) { + jsi::Value committedJsi = wordsToJsi(rt, committedVec); + jsi::Value nonCommittedJsi = wordsToJsi(rt, nonCommittedVec); + + callback->call(rt, std::move(committedJsi), std::move(nonCommittedJsi), + jsi::Value(isDone)); + }); + }; this->isStreaming = true; while (this->isStreaming) { @@ -100,14 +197,14 @@ void SpeechToText::stream(std::shared_ptr callback, ProcessResult res = this->processor->processIter(DecodingOptions(languageOption)); - nativeCallback({res.committed.begin(), res.committed.end()}, - {res.nonCommitted.begin(), res.nonCommitted.end()}, false); + nativeCallback(res.committed, res.nonCommitted, false); this->readyToProcess = false; } - 
std::string committed = this->processor->finish(); + // finish() now returns std::vector + std::vector committed = this->processor->finish(); - nativeCallback({committed.begin(), committed.end()}, {}, true); + nativeCallback(committed, {}, true); this->resetStreamState(); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h index d2111d378..af02a5357 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h @@ -23,7 +23,8 @@ class SpeechToText { [[nodiscard( "Registered non-void function")]] std::shared_ptr decode(std::span tokens, std::span encoderOutput) const; - [[nodiscard("Registered non-void function")]] std::vector + // [[nodiscard("Registered non-void function")]] std::vector + [[nodiscard("Registered non-void function")]] std::vector transcribe(std::span waveform, std::string languageOption) const; size_t getMemoryLowerBound() const noexcept; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp index c6a99e9a2..b8a7aced4 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp @@ -77,23 +77,27 @@ void OnlineASRProcessor::chunkAt(float time) { this->bufferTimeOffset = time; } -std::string OnlineASRProcessor::finish() { - const std::deque buffer = this->hypothesisBuffer.complete(); - std::string committedText = this->toFlush(buffer); +std::vector OnlineASRProcessor::finish() { + std::deque bufferDeq = this->hypothesisBuffer.complete(); + std::vector buffer(std::make_move_iterator(bufferDeq.begin()), + std::make_move_iterator(bufferDeq.end())); + + // std::string committedText = this->toFlush(buffer); this->bufferTimeOffset += static_cast(audioBuffer.size()) / OnlineASRProcessor::kSamplingRate; - return committedText; + return buffer; } -std::string OnlineASRProcessor::toFlush(const std::deque &words) const { - std::string text; - text.reserve(std::accumulate( - words.cbegin(), words.cend(), 0, - [](size_t sum, const Word &w) { return sum + w.content.size(); })); - for (const auto &word : words) { - text.append(word.content); - } - return text; -} +// std::string OnlineASRProcessor::toFlush(const std::deque &words) const +// { +// std::string text; +// text.reserve(std::accumulate( +// words.cbegin(), words.cend(), 0, +// [](size_t sum, const Word &w) { return sum + w.content.size(); })); +// for (const auto &word : words) { +// text.append(word.content); +// } +// return text; +// } } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h index c50b56271..720e6bf76 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h @@ -12,7 +12,8 @@ class OnlineASRProcessor { void 
insertAudioChunk(std::span audio); types::ProcessResult processIter(const types::DecodingOptions &options); - std::string finish(); + // std::string finish(); + std::vector finish(); std::vector audioBuffer; @@ -27,7 +28,7 @@ class OnlineASRProcessor { void chunkCompletedSegment(std::span res); void chunkAt(float time); - std::string toFlush(const std::deque &words) const; + // std::string toFlush(const std::deque &words) const; }; } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 3e1324f54..6f22bf7b5 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -1,5 +1,147 @@ +// import { useEffect, useCallback, useState } from 'react'; +// import { SpeechToTextModule, Word } from '../../modules/natural_language_processing/SpeechToTextModule'; +// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; +// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; +// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; + +// export const useSpeechToText = ({ +// model, +// preventLoad = false, +// }: { +// model: SpeechToTextModelConfig; +// preventLoad?: boolean; +// }) => { +// const [error, setError] = useState(null); +// const [isReady, setIsReady] = useState(false); +// const [isGenerating, setIsGenerating] = useState(false); +// const [downloadProgress, setDownloadProgress] = useState(0); + +// const [modelInstance] = useState(() => new SpeechToTextModule()); +// const [committedTranscription, setCommittedTranscription] = useState(Word); +// const [nonCommittedTranscription, setNonCommittedTranscription] = +// useState(Word); + +// useEffect(() => { +// if (preventLoad) return; +// (async () => { +// setDownloadProgress(0); +// setError(null); +// try { +// setIsReady(false); +// await modelInstance.load( +// { +// isMultilingual: model.isMultilingual, +// encoderSource: model.encoderSource, +// decoderSource: model.decoderSource, +// tokenizerSource: model.tokenizerSource, +// }, +// setDownloadProgress +// ); +// setIsReady(true); +// } catch (err) { +// setError(parseUnknownError(err)); +// } +// })(); +// }, [ +// modelInstance, +// model.isMultilingual, +// model.encoderSource, +// model.decoderSource, +// model.tokenizerSource, +// preventLoad, +// ]); + +// const stateWrapper = useCallback( +// Promise>(fn: T) => +// async (...args: Parameters): Promise>> => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. Please load the model before calling this function.' +// ); +// if (isGenerating) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModelGenerating, +// 'The model is currently generating. Please wait until previous model run is complete.' +// ); +// setIsGenerating(true); +// try { +// return await fn.apply(modelInstance, args); +// } finally { +// setIsGenerating(false); +// } +// }, +// [isReady, isGenerating, modelInstance] +// ); + +// const stream = useCallback( +// async (options?: DecodingOptions) => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. 
Please load the model before calling this function.' +// ); +// if (isGenerating) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModelGenerating, +// 'The model is currently generating. Please wait until previous model run is complete.' +// ); +// setIsGenerating(true); +// setCommittedTranscription(''); +// setNonCommittedTranscription(''); +// let transcription = ''; +// try { +// for await (const { committed, nonCommitted } of modelInstance.stream( +// options +// )) { +// setCommittedTranscription((prev) => prev + committed); +// setNonCommittedTranscription(nonCommitted); +// transcription += committed; +// } +// } finally { +// setIsGenerating(false); +// } +// return transcription; +// }, +// [isReady, isGenerating, modelInstance] +// ); + +// const wrapper = useCallback( +// any>(fn: T) => { +// return (...args: Parameters): ReturnType => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. Please load the model before calling this function.' +// ); +// return fn.apply(modelInstance, args); +// }; +// }, +// [isReady, modelInstance] +// ); + +// return { +// error, +// isReady, +// isGenerating, +// downloadProgress, +// committedTranscription, +// nonCommittedTranscription, +// encode: stateWrapper(SpeechToTextModule.prototype.encode), +// decode: stateWrapper(SpeechToTextModule.prototype.decode), +// transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), +// stream, +// streamStop: wrapper(SpeechToTextModule.prototype.streamStop), +// streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), +// }; +// }; + import { useEffect, useCallback, useState } from 'react'; -import { SpeechToTextModule } from '../../modules/natural_language_processing/SpeechToTextModule'; +// Make sure Word is exported from your module file +import { + SpeechToTextModule, + Word, +} from '../../modules/natural_language_processing/SpeechToTextModule'; import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; @@ -17,9 +159,14 @@ export const useSpeechToText = ({ const [downloadProgress, setDownloadProgress] = useState(0); const [modelInstance] = useState(() => new SpeechToTextModule()); - const [committedTranscription, setCommittedTranscription] = useState(''); - const [nonCommittedTranscription, setNonCommittedTranscription] = - useState(''); + + // FIX 1: Initialize with empty array [], generic type Word[] + const [committedTranscription, setCommittedTranscription] = useState( + [] + ); + const [nonCommittedTranscription, setNonCommittedTranscription] = useState< + Word[] + >([]); useEffect(() => { if (preventLoad) return; @@ -87,21 +234,31 @@ export const useSpeechToText = ({ 'The model is currently generating. Please wait until previous model run is complete.' 
); setIsGenerating(true); - setCommittedTranscription(''); - setNonCommittedTranscription(''); - let transcription = ''; + + // FIX 2: Reset to empty arrays + setCommittedTranscription([]); + setNonCommittedTranscription([]); + + // Accumulator is now an array of Words, not a string + const fullResult: Word[] = []; + try { for await (const { committed, nonCommitted } of modelInstance.stream( options )) { - setCommittedTranscription((prev) => prev + committed); + // FIX 3: Update state by appending arrays + if (committed.length > 0) { + setCommittedTranscription((prev) => [...prev, ...committed]); + fullResult.push(...committed); + } + + // nonCommitted is always a fresh partial chunk setNonCommittedTranscription(nonCommitted); - transcription += committed; } } finally { setIsGenerating(false); } - return transcription; + return fullResult; }, [isReady, isGenerating, modelInstance] ); diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index 9619547c8..e0ca88251 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -1,18 +1,212 @@ +// import { Logger } from '../../common/Logger'; +// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; +// import { ResourceFetcher } from '../../utils/ResourceFetcher'; +// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; +// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; + +// export class SpeechToTextModule { +// private nativeModule: any; + +// private modelConfig!: SpeechToTextModelConfig; + +// private textDecoder = new TextDecoder('utf-8', { +// fatal: false, +// ignoreBOM: true, +// }); + +// public async load( +// model: SpeechToTextModelConfig, +// onDownloadProgressCallback: (progress: number) => void = () => {} +// ) { +// this.modelConfig = model; + +// const tokenizerLoadPromise = ResourceFetcher.fetch( +// undefined, +// model.tokenizerSource +// ); +// const encoderDecoderPromise = ResourceFetcher.fetch( +// onDownloadProgressCallback, +// model.encoderSource, +// model.decoderSource +// ); +// const [tokenizerSources, encoderDecoderResults] = await Promise.all([ +// tokenizerLoadPromise, +// encoderDecoderPromise, +// ]); +// const encoderSource = encoderDecoderResults?.[0]; +// const decoderSource = encoderDecoderResults?.[1]; +// if (!encoderSource || !decoderSource || !tokenizerSources) { +// throw new RnExecutorchError( +// RnExecutorchErrorCode.DownloadInterrupted, +// 'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.' +// ); +// } +// this.nativeModule = await global.loadSpeechToText( +// encoderSource, +// decoderSource, +// tokenizerSources[0]! 
+// ); +// } + +// public delete(): void { +// this.nativeModule.unload(); +// } + +// public async encode( +// waveform: Float32Array | number[] +// ): Promise { +// if (Array.isArray(waveform)) { +// Logger.info( +// 'Passing waveform as number[] is deprecated, use Float32Array instead' +// ); +// waveform = new Float32Array(waveform); +// } +// return new Float32Array(await this.nativeModule.encode(waveform)); +// } + +// public async decode( +// tokens: Int32Array | number[], +// encoderOutput: Float32Array | number[] +// ): Promise { +// if (Array.isArray(tokens)) { +// Logger.info( +// 'Passing tokens as number[] is deprecated, use Int32Array instead' +// ); +// tokens = new Int32Array(tokens); +// } +// if (Array.isArray(encoderOutput)) { +// Logger.info( +// 'Passing encoderOutput as number[] is deprecated, use Float32Array instead' +// ); +// encoderOutput = new Float32Array(encoderOutput); +// } +// return new Float32Array( +// await this.nativeModule.decode(tokens, encoderOutput) +// ); +// } + +// public async transcribe( +// waveform: Float32Array | number[], +// options: DecodingOptions = {} +// ): Promise { +// this.validateOptions(options); + +// if (Array.isArray(waveform)) { +// Logger.info( +// 'Passing waveform as number[] is deprecated, use Float32Array instead' +// ); +// waveform = new Float32Array(waveform); +// } +// const transcriptionBytes = await this.nativeModule.transcribe( +// waveform, +// options.language || '' +// ); +// return this.textDecoder.decode(new Uint8Array(transcriptionBytes)); +// } + +// public async *stream( +// options: DecodingOptions = {} +// ): AsyncGenerator<{ committed: string; nonCommitted: string }> { +// this.validateOptions(options); + +// const queue: { committed: string; nonCommitted: string }[] = []; +// let waiter: (() => void) | null = null; +// let finished = false; +// let error: unknown; + +// const wake = () => { +// waiter?.(); +// waiter = null; +// }; + +// (async () => { +// try { +// await this.nativeModule.stream( +// (committed: number[], nonCommitted: number[], isDone: boolean) => { +// queue.push({ +// committed: this.textDecoder.decode(new Uint8Array(committed)), +// nonCommitted: this.textDecoder.decode( +// new Uint8Array(nonCommitted) +// ), +// }); +// if (isDone) { +// finished = true; +// } +// wake(); +// }, +// options.language || '' +// ); +// finished = true; +// wake(); +// } catch (e) { +// error = e; +// finished = true; +// wake(); +// } +// })(); + +// while (true) { +// if (queue.length > 0) { +// yield queue.shift()!; +// if (finished && queue.length === 0) { +// return; +// } +// continue; +// } +// if (error) throw parseUnknownError(error); +// if (finished) return; +// await new Promise((r) => (waiter = r)); +// } +// } + +// public streamInsert(waveform: Float32Array | number[]): void { +// if (Array.isArray(waveform)) { +// Logger.info( +// 'Passing waveform as number[] is deprecated, use Float32Array instead' +// ); +// waveform = new Float32Array(waveform); +// } +// this.nativeModule.streamInsert(waveform); +// } + +// public streamStop(): void { +// this.nativeModule.streamStop(); +// } + +// private validateOptions(options: DecodingOptions) { +// if (!this.modelConfig.isMultilingual && options.language) { +// throw new RnExecutorchError( +// RnExecutorchErrorCode.InvalidConfig, +// 'Model is not multilingual, cannot set language' +// ); +// } +// if (this.modelConfig.isMultilingual && !options.language) { +// throw new RnExecutorchError( +// RnExecutorchErrorCode.InvalidConfig, +// 'Model 
is multilingual, provide a language' +// ); +// } +// } +// } + import { Logger } from '../../common/Logger'; import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; import { ResourceFetcher } from '../../utils/ResourceFetcher'; import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; +// 1. Define the Word interface matching your C++ JSI object structure +export interface Word { + word: string; + start: number; + end: number; +} + export class SpeechToTextModule { private nativeModule: any; - private modelConfig!: SpeechToTextModelConfig; - private textDecoder = new TextDecoder('utf-8', { - fatal: false, - ignoreBOM: true, - }); + // 2. TextDecoder is removed as C++ now returns JS objects directly public async load( model: SpeechToTextModelConfig, @@ -85,10 +279,11 @@ export class SpeechToTextModule { ); } + // 3. Update transcribe to return Word[] instead of string public async transcribe( waveform: Float32Array | number[], options: DecodingOptions = {} - ): Promise { + ): Promise { this.validateOptions(options); if (Array.isArray(waveform)) { @@ -97,19 +292,23 @@ export class SpeechToTextModule { ); waveform = new Float32Array(waveform); } - const transcriptionBytes = await this.nativeModule.transcribe( + + // The native module now returns an Array of Objects, not bytes + const transcription: Word[] = await this.nativeModule.transcribe( waveform, options.language || '' ); - return this.textDecoder.decode(new Uint8Array(transcriptionBytes)); + + return transcription; } + // 4. Update stream to yield Word[] structure public async *stream( options: DecodingOptions = {} - ): AsyncGenerator<{ committed: string; nonCommitted: string }> { + ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }> { this.validateOptions(options); - const queue: { committed: string; nonCommitted: string }[] = []; + const queue: { committed: Word[]; nonCommitted: Word[] }[] = []; let waiter: (() => void) | null = null; let finished = false; let error: unknown; @@ -122,12 +321,11 @@ export class SpeechToTextModule { (async () => { try { await this.nativeModule.stream( - (committed: number[], nonCommitted: number[], isDone: boolean) => { + // Callback now receives arrays of objects directly + (committed: Word[], nonCommitted: Word[], isDone: boolean) => { queue.push({ - committed: this.textDecoder.decode(new Uint8Array(committed)), - nonCommitted: this.textDecoder.decode( - new Uint8Array(nonCommitted) - ), + committed, + nonCommitted, }); if (isDone) { finished = true; From b75204e25d6ba1864518e0d221ce99ddda86d6ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Tue, 20 Jan 2026 14:44:24 +0100 Subject: [PATCH 02/14] Add missing headers --- .../common/rnexecutorch/host_objects/JsiConversions.h | 1 + .../common/rnexecutorch/host_objects/ModelHostObject.h | 1 + 2 files changed, 2 insertions(+) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index ec6332209..f43d14180 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -18,6 +18,7 @@ #include #include #include +#include #include namespace rnexecutorch::jsi_conversion { diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h 
b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index c8232fe8c..fc96965ec 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include From 3c72c17341a6173142324bd2e19b7cd50e225c05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Tue, 20 Jan 2026 21:59:21 +0100 Subject: [PATCH 03/14] Add draft of working version for timestamps only --- apps/speech/screens/SpeechToTextScreen.tsx | 335 +++++++++++++++++- .../host_objects/JsiConversions.h | 49 ++- .../host_objects/ModelHostObject.h | 2 + .../models/speech_to_text/SpeechToText.cpp | 40 +-- .../stream/OnlineASRProcessor.cpp | 6 +- .../speech_to_text/types/ProcessResult.h | 9 +- 6 files changed, 397 insertions(+), 44 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 1e4525986..ad78dcb49 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -1,3 +1,300 @@ +// import React, { useEffect, useRef, useState } from 'react'; +// import { +// Text, +// View, +// StyleSheet, +// TouchableOpacity, +// ScrollView, +// TextInput, +// KeyboardAvoidingView, +// Platform, +// } from 'react-native'; +// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +// import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +// import FontAwesome from '@expo/vector-icons/FontAwesome'; +// import { +// AudioManager, +// AudioRecorder, +// AudioContext, +// } from 'react-native-audio-api'; +// import * as FileSystem from 'expo-file-system/legacy'; +// import SWMIcon from '../assets/swm_icon.svg'; +// import DeviceInfo from 'react-native-device-info'; + +// const isSimulator = DeviceInfo.isEmulatorSync(); + +// export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { +// const model = useSpeechToText({ +// model: WHISPER_TINY_EN, +// }); + +// const [transcription, setTranscription] = useState(''); +// const [audioURL, setAudioURL] = useState(''); +// const [liveTranscribing, setLiveTranscribing] = useState(false); +// const scrollViewRef = useRef(null); + +// const [recorder] = useState( +// () => +// new AudioRecorder({ +// sampleRate: 16000, +// bufferLengthInSamples: 1600, +// }) +// ); + +// useEffect(() => { +// AudioManager.setAudioSessionOptions({ +// iosCategory: 'playAndRecord', +// iosMode: 'spokenAudio', +// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], +// }); +// AudioManager.requestRecordingPermissions(); +// }, []); + +// const handleTranscribeFromURL = async () => { +// if (!audioURL.trim()) { +// console.warn('Please provide a valid audio file URL'); +// return; +// } + +// const { uri } = await FileSystem.downloadAsync( +// audioURL, +// FileSystem.cacheDirectory + 'audio_file' +// ); + +// const audioContext = new AudioContext({ sampleRate: 16000 }); + +// try { +// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); +// const audioBuffer = decodedAudioData.getChannelData(0); +// setTranscription(await model.transcribe(audioBuffer)); +// } catch (error) { +// console.error('Error decoding audio data', error); +// console.warn('Note: Supported file formats: mp3, wav, flac'); +// return; +// } +// }; + +// const handleStartTranscribeFromMicrophone = async () => { +// 
setLiveTranscribing(true); +// setTranscription(''); +// recorder.onAudioReady(({ buffer }) => { +// model.streamInsert(buffer.getChannelData(0)); +// }); +// recorder.start(); + +// try { +// await model.stream(); +// } catch (error) { +// console.error('Error during live transcription:', error); +// } +// }; + +// const handleStopTranscribeFromMicrophone = () => { +// recorder.stop(); +// model.streamStop(); +// console.log('Live transcription stopped'); +// setLiveTranscribing(false); +// }; + +// const getModelStatus = () => { +// if (model.error) return `${model.error}`; +// if (model.isGenerating) return 'Transcribing...'; +// if (model.isReady) return 'Ready to transcribe'; +// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; +// }; + +// const readyToTranscribe = !model.isGenerating && model.isReady; +// const recordingButtonDisabled = isSimulator || !readyToTranscribe; + +// return ( +// +// +// +// +// +// +// +// +// React Native ExecuTorch +// Speech to Text +// + +// +// Status: {getModelStatus()} +// + +// +// Transcription +// +// scrollViewRef.current?.scrollToEnd({ animated: true }) +// } +// > +// +// {transcription !== '' +// ? transcription +// : model.committedTranscription + +// model.nonCommittedTranscription} +// +// +// + +// +// +// +// +// Start +// +// + +// {liveTranscribing ? ( +// +// +// Stop Live Transcription +// +// ) : ( +// +// +// +// {isSimulator +// ? 'Recording is not available on Simulator' +// : 'Start Live Transcription'} +// +// +// )} +// +// +// +// +// ); +// }; + +// const styles = StyleSheet.create({ +// container: { +// flex: 1, +// alignItems: 'center', +// backgroundColor: 'white', +// paddingHorizontal: 16, +// }, +// keyboardAvoidingView: { +// flex: 1, +// width: '100%', +// }, +// header: { +// alignItems: 'center', +// position: 'relative', +// width: '100%', +// }, +// backButton: { +// position: 'absolute', +// left: 0, +// top: 10, +// padding: 10, +// zIndex: 1, +// }, +// headerText: { +// fontSize: 22, +// fontWeight: 'bold', +// color: '#0f186e', +// }, +// statusContainer: { +// marginTop: 12, +// alignItems: 'center', +// }, +// transcriptionContainer: { +// flex: 1, +// width: '100%', +// marginVertical: 12, +// }, +// transcriptionLabel: { +// marginLeft: 12, +// marginBottom: 4, +// color: '#0f186e', +// }, +// transcriptionScrollContainer: { +// borderRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// padding: 12, +// }, +// inputContainer: { +// marginBottom: 12, +// }, +// urlTranscriptionContainer: { +// width: '100%', +// flexDirection: 'row', +// }, +// urlTranscriptionInput: { +// flex: 1, +// padding: 12, +// borderTopLeftRadius: 12, +// borderBottomLeftRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// borderRightWidth: 0, +// }, +// urlTranscriptionButton: { +// backgroundColor: '#0f186e', +// justifyContent: 'center', +// alignItems: 'center', +// padding: 12, +// borderTopRightRadius: 12, +// borderBottomRightRadius: 12, +// }, +// buttonText: { +// color: 'white', +// fontWeight: '600', +// letterSpacing: -0.5, +// fontSize: 16, +// }, +// liveTranscriptionButton: { +// flexDirection: 'row', +// justifyContent: 'center', +// alignItems: 'center', +// padding: 12, +// borderRadius: 12, +// marginTop: 12, +// gap: 8, +// }, +// backgroundRed: { +// backgroundColor: 'red', +// }, +// backgroundBlue: { +// backgroundColor: '#0f186e', +// }, +// disabled: { +// opacity: 0.5, +// }, +// }); + import React, { useEffect, useRef, useState } from 'react'; import { Text, 
@@ -10,7 +307,12 @@ import { Platform, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; -import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +import { + useSpeechToText, + WHISPER_TINY_EN, + // Make sure Word is exported from your module + Word, +} from 'react-native-executorch'; import FontAwesome from '@expo/vector-icons/FontAwesome'; import { AudioManager, @@ -28,7 +330,9 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - const [transcription, setTranscription] = useState(''); + // CHANGE 1: Update state to hold Word[] instead of string + const [transcription, setTranscription] = useState([]); + const [audioURL, setAudioURL] = useState(''); const [liveTranscribing, setLiveTranscribing] = useState(false); const scrollViewRef = useRef(null); @@ -50,6 +354,16 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { AudioManager.requestRecordingPermissions(); }, []); + const getText = (words: Word[]) => { + return words + .map((w) => { + // Format: "hello (0.00s - 0.50s) " + // using toFixed(2) for cleaner timestamp display + return `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`; + }) + .join(''); + }; + const handleTranscribeFromURL = async () => { if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); @@ -66,6 +380,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); + // model.transcribe now returns Word[], which matches our state type setTranscription(await model.transcribe(audioBuffer)); } catch (error) { console.error('Error decoding audio data', error); @@ -76,7 +391,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const handleStartTranscribeFromMicrophone = async () => { setLiveTranscribing(true); - setTranscription(''); + setTranscription([]); // Reset to empty array recorder.onAudioReady(({ buffer }) => { model.streamInsert(buffer.getChannelData(0)); }); @@ -106,6 +421,13 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; + // CHANGE 3: Prepare the text for rendering + const displayedText = + transcription.length > 0 + ? getText(transcription) + : getText(model.committedTranscription) + + getText(model.nonCommittedTranscription); + return ( @@ -135,12 +457,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { scrollViewRef.current?.scrollToEnd({ animated: true }) } > - - {transcription !== '' - ? 
transcription - : model.committedTranscription + - model.nonCommittedTranscription} - + {displayedText} diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index f43d14180..df7f635d9 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -21,6 +21,8 @@ #include #include +using rnexecutorch::models::speech_to_text::types::Word; + namespace rnexecutorch::jsi_conversion { using namespace facebook; @@ -64,18 +66,24 @@ getValue>(const jsi::Value &val, } template <> -inline getValue(const jsi::Value &val, jsi::Runtime &runtime) { - jsi::Array jsiArr(rt, words.size()); - for (size_t i = 0; i < words.size(); ++i) { - jsi::Object obj(rt); - obj.setProperty(rt, "word", - jsi::String::createFromUtf8(rt, words[i].content)); - obj.setProperty(rt, "start", static_cast(words[i].start)); - obj.setProperty(rt, "end", static_cast(words[i].end)); - jsiArr.setValueAtIndex(rt, i, obj); - } - return jsiArr; -}; +inline Word getValue(const jsi::Value &val, jsi::Runtime &runtime) { + jsi::Object obj = val.asObject(runtime); + + // 1. Extract the string "word" using the existing string helper + std::string content = getValue(obj.getProperty(runtime, "word"), runtime); + + // 2. Extract start/end times + // We use .asNumber() directly as these are primitives + double start = obj.getProperty(runtime, "start").asNumber(); + double end = obj.getProperty(runtime, "end").asNumber(); + + // 3. Construct and return the C++ Word struct + return Word{ + .content = std::move(content), + .start = static_cast(start), + .end = static_cast(end) + }; +} template <> inline JSTensorViewIn getValue(const jsi::Value &val, @@ -305,6 +313,23 @@ inline jsi::Value getJsiValue(std::shared_ptr valuePtr, return std::move(*valuePtr); } +inline jsi::Value getJsiValue(const Word &word, jsi::Runtime &runtime) { + jsi::Object obj(runtime); + obj.setProperty(runtime, "word", jsi::String::createFromUtf8(runtime, word.content)); + obj.setProperty(runtime, "start", static_cast(word.start)); + obj.setProperty(runtime, "end", static_cast(word.end)); + return obj; +} + +inline jsi::Value getJsiValue(const std::vector &vec, jsi::Runtime &runtime) { + jsi::Array array(runtime, vec.size()); + for (size_t i = 0; i < vec.size(); ++i) { + // Convert each Word using the helper above and place in array + array.setValueAtIndex(runtime, i, getJsiValue(vec[i], runtime)); + } + return {runtime, array}; +} + inline jsi::Value getJsiValue(const std::vector &vec, jsi::Runtime &runtime) { jsi::Array array(runtime, vec.size()); diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index fc96965ec..c71a58e41 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -26,6 +26,8 @@ #include #include +using rnexecutorch::models::speech_to_text::types::Word; + namespace rnexecutorch { template class ModelHostObject : public JsiHostObject { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index 7f8ef81cf..a97edfcb9 
100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -43,28 +43,28 @@ SpeechToText::decode(std::span tokens, return std::make_shared(decoderOutput); } -std::vector SpeechToText::transcribe(std::span waveform, - std::string languageOption) const { - std::vector segments = - this->asr->transcribe(waveform, DecodingOptions(languageOption)); - std::string transcription; - - size_t transcriptionLength = 0; - for (auto &segment : segments) { - for (auto &word : segment.words) { - transcriptionLength += word.content.size(); - } - } - transcription.reserve(transcriptionLength); +// std::vector SpeechToText::transcribe(std::span waveform, +// std::string languageOption) const { +// std::vector segments = +// this->asr->transcribe(waveform, DecodingOptions(languageOption)); +// std::string transcription; + +// size_t transcriptionLength = 0; +// for (auto &segment : segments) { +// for (auto &word : segment.words) { +// transcriptionLength += word.content.size(); +// } +// } +// transcription.reserve(transcriptionLength); - for (auto &segment : segments) { - for (auto &word : segment.words) { - transcription += word.content; - } - } +// for (auto &segment : segments) { +// for (auto &word : segment.words) { +// transcription += word.content; +// } +// } - return {transcription.begin(), transcription.end()}; -} +// return {transcription.begin(), transcription.end()}; +// } std::vector SpeechToText::transcribe(std::span waveform, std::string languageOption) const { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp index b8a7aced4..f62986b72 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp @@ -35,7 +35,11 @@ ProcessResult OnlineASRProcessor::processIter(const DecodingOptions &options) { } std::deque nonCommittedWords = this->hypothesisBuffer.complete(); - return {this->toFlush(flushed), this->toFlush(nonCommittedWords)}; + // return {this->toFlush(flushed), this->toFlush(nonCommittedWords)}; + return {std::vector(std::make_move_iterator(flushed.begin()), + std::make_move_iterator(flushed.end())), + std::vector(std::make_move_iterator(nonCommittedWords.begin()), + std::make_move_iterator(nonCommittedWords.end()))}; } void OnlineASRProcessor::chunkCompletedSegment(std::span res) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h index 0cb05e5a6..685ba2b76 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h @@ -4,9 +4,14 @@ namespace rnexecutorch::models::speech_to_text::types { +// struct ProcessResult { +// std::string committed; +// std::string nonCommitted; +// }; + struct ProcessResult { - std::string committed; - std::string nonCommitted; + std::vector committed; + std::vector nonCommitted; }; } // namespace rnexecutorch::models::speech_to_text::types From 
c0218bfb8de675935be934bfcbca340ccc715293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 15:36:31 +0100 Subject: [PATCH 04/14] Working version of both timestamping and regular version --- apps/speech/screens/SpeechToTextScreen.tsx | 414 +++++++++++++++++- .../host_objects/ModelHostObject.h | 5 + .../models/speech_to_text/SpeechToText.cpp | 109 +++-- .../models/speech_to_text/SpeechToText.h | 6 +- .../useSpeechToText.ts | 243 ++++++++-- .../SpeechToTextModule.ts | 290 ++++-------- .../react-native-executorch/src/types/stt.ts | 1 + 7 files changed, 755 insertions(+), 313 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index ad78dcb49..9dab4420b 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -295,6 +295,323 @@ // }, // }); +// import React, { useEffect, useRef, useState } from 'react'; +// import { +// Text, +// View, +// StyleSheet, +// TouchableOpacity, +// ScrollView, +// TextInput, +// KeyboardAvoidingView, +// Platform, +// } from 'react-native'; +// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +// import { +// useSpeechToText, +// WHISPER_TINY_EN, +// // Make sure Word is exported from your module +// Word, +// } from 'react-native-executorch'; +// import FontAwesome from '@expo/vector-icons/FontAwesome'; +// import { +// AudioManager, +// AudioRecorder, +// AudioContext, +// } from 'react-native-audio-api'; +// import * as FileSystem from 'expo-file-system/legacy'; +// import SWMIcon from '../assets/swm_icon.svg'; +// import DeviceInfo from 'react-native-device-info'; + +// const isSimulator = DeviceInfo.isEmulatorSync(); + +// export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { +// const model = useSpeechToText({ +// model: WHISPER_TINY_EN, +// }); + +// // CHANGE 1: Update state to hold Word[] instead of string +// const [transcription, setTranscription] = useState([]); + +// const [audioURL, setAudioURL] = useState(''); +// const [liveTranscribing, setLiveTranscribing] = useState(false); +// const scrollViewRef = useRef(null); + +// const [recorder] = useState( +// () => +// new AudioRecorder({ +// sampleRate: 16000, +// bufferLengthInSamples: 1600, +// }) +// ); + +// useEffect(() => { +// AudioManager.setAudioSessionOptions({ +// iosCategory: 'playAndRecord', +// iosMode: 'spokenAudio', +// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], +// }); +// AudioManager.requestRecordingPermissions(); +// }, []); + +// const getText = (words: Word[]) => { +// return words +// .map((w) => { +// // Format: "hello (0.00s - 0.50s) " +// // using toFixed(2) for cleaner timestamp display +// return `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`; +// }) +// .join(''); +// }; + +// const handleTranscribeFromURL = async () => { +// if (!audioURL.trim()) { +// console.warn('Please provide a valid audio file URL'); +// return; +// } + +// const { uri } = await FileSystem.downloadAsync( +// audioURL, +// FileSystem.cacheDirectory + 'audio_file' +// ); + +// const audioContext = new AudioContext({ sampleRate: 16000 }); + +// try { +// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); +// const audioBuffer = decodedAudioData.getChannelData(0); +// // model.transcribe now returns Word[], which matches our state type +// setTranscription(await model.transcribe(audioBuffer)); +// } catch (error) { +// console.error('Error decoding audio 
data', error); +// console.warn('Note: Supported file formats: mp3, wav, flac'); +// return; +// } +// }; + +// const handleStartTranscribeFromMicrophone = async () => { +// setLiveTranscribing(true); +// setTranscription([]); // Reset to empty array +// recorder.onAudioReady(({ buffer }) => { +// model.streamInsert(buffer.getChannelData(0)); +// }); +// recorder.start(); + +// try { +// await model.stream(); +// } catch (error) { +// console.error('Error during live transcription:', error); +// } +// }; + +// const handleStopTranscribeFromMicrophone = () => { +// recorder.stop(); +// model.streamStop(); +// console.log('Live transcription stopped'); +// setLiveTranscribing(false); +// }; + +// const getModelStatus = () => { +// if (model.error) return `${model.error}`; +// if (model.isGenerating) return 'Transcribing...'; +// if (model.isReady) return 'Ready to transcribe'; +// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; +// }; + +// const readyToTranscribe = !model.isGenerating && model.isReady; +// const recordingButtonDisabled = isSimulator || !readyToTranscribe; + +// // CHANGE 3: Prepare the text for rendering +// const displayedText = +// transcription.length > 0 +// ? getText(transcription) +// : getText(model.committedTranscription) + +// getText(model.nonCommittedTranscription); + +// return ( +// +// +// +// +// +// +// +// +// React Native ExecuTorch +// Speech to Text +// + +// +// Status: {getModelStatus()} +// + +// +// Transcription +// +// scrollViewRef.current?.scrollToEnd({ animated: true }) +// } +// > +// {displayedText} +// +// + +// +// +// +// +// Start +// +// + +// {liveTranscribing ? ( +// +// +// Stop Live Transcription +// +// ) : ( +// +// +// +// {isSimulator +// ? 'Recording is not available on Simulator' +// : 'Start Live Transcription'} +// +// +// )} +// +// +// +// +// ); +// }; + +// const styles = StyleSheet.create({ +// container: { +// flex: 1, +// alignItems: 'center', +// backgroundColor: 'white', +// paddingHorizontal: 16, +// }, +// keyboardAvoidingView: { +// flex: 1, +// width: '100%', +// }, +// header: { +// alignItems: 'center', +// position: 'relative', +// width: '100%', +// }, +// backButton: { +// position: 'absolute', +// left: 0, +// top: 10, +// padding: 10, +// zIndex: 1, +// }, +// headerText: { +// fontSize: 22, +// fontWeight: 'bold', +// color: '#0f186e', +// }, +// statusContainer: { +// marginTop: 12, +// alignItems: 'center', +// }, +// transcriptionContainer: { +// flex: 1, +// width: '100%', +// marginVertical: 12, +// }, +// transcriptionLabel: { +// marginLeft: 12, +// marginBottom: 4, +// color: '#0f186e', +// }, +// transcriptionScrollContainer: { +// borderRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// padding: 12, +// }, +// inputContainer: { +// marginBottom: 12, +// }, +// urlTranscriptionContainer: { +// width: '100%', +// flexDirection: 'row', +// }, +// urlTranscriptionInput: { +// flex: 1, +// padding: 12, +// borderTopLeftRadius: 12, +// borderBottomLeftRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// borderRightWidth: 0, +// }, +// urlTranscriptionButton: { +// backgroundColor: '#0f186e', +// justifyContent: 'center', +// alignItems: 'center', +// padding: 12, +// borderTopRightRadius: 12, +// borderBottomRightRadius: 12, +// }, +// buttonText: { +// color: 'white', +// fontWeight: '600', +// letterSpacing: -0.5, +// fontSize: 16, +// }, +// liveTranscriptionButton: { +// flexDirection: 'row', +// justifyContent: 'center', +// alignItems: 'center', +// 
padding: 12, +// borderRadius: 12, +// marginTop: 12, +// gap: 8, +// }, +// backgroundRed: { +// backgroundColor: 'red', +// }, +// backgroundBlue: { +// backgroundColor: '#0f186e', +// }, +// disabled: { +// opacity: 0.5, +// }, +// }); + import React, { useEffect, useRef, useState } from 'react'; import { Text, @@ -305,12 +622,12 @@ import { TextInput, KeyboardAvoidingView, Platform, + Switch, // Import Switch } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { useSpeechToText, WHISPER_TINY_EN, - // Make sure Word is exported from your module Word, } from 'react-native-executorch'; import FontAwesome from '@expo/vector-icons/FontAwesome'; @@ -330,8 +647,11 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - // CHANGE 1: Update state to hold Word[] instead of string - const [transcription, setTranscription] = useState([]); + // CHANGE 1: State can now be string OR Word[] + const [transcription, setTranscription] = useState(''); + + // CHANGE 2: Add toggle for timestamps + const [enableTimestamps, setEnableTimestamps] = useState(false); const [audioURL, setAudioURL] = useState(''); const [liveTranscribing, setLiveTranscribing] = useState(false); @@ -354,17 +674,20 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { AudioManager.requestRecordingPermissions(); }, []); - const getText = (words: Word[]) => { - return words - .map((w) => { - // Format: "hello (0.00s - 0.50s) " - // using toFixed(2) for cleaner timestamp display - return `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`; - }) + // CHANGE 3: Smart helper that handles both formats + const getText = (data: string | Word[] | undefined) => { + console.log('UI Received:', JSON.stringify(data)); + if (!data) return ''; + if (typeof data === 'string') return data; + + // It's Word[], format with timestamps + return data + .map((w) => `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`) .join(''); }; const handleTranscribeFromURL = async () => { + console.log('[1] UI: Button Pressed. Calling model.stream()...'); if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); return; @@ -380,8 +703,20 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); - // model.transcribe now returns Word[], which matches our state type - setTranscription(await model.transcribe(audioBuffer)); + + // CHANGE 4: Pass the toggle flag to transcribe + // TypeScript will infer the return type based on the flag + if (enableTimestamps) { + const result = await model.transcribe(audioBuffer, { + enableTimestamps: true, + }); + setTranscription(result); + } else { + const result = await model.transcribe(audioBuffer, { + enableTimestamps: false, + }); + setTranscription(result); + } } catch (error) { console.error('Error decoding audio data', error); console.warn('Note: Supported file formats: mp3, wav, flac'); @@ -391,14 +726,21 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const handleStartTranscribeFromMicrophone = async () => { setLiveTranscribing(true); - setTranscription([]); // Reset to empty array + // Reset based on mode + setTranscription(enableTimestamps ? 
[] : ''); + recorder.onAudioReady(({ buffer }) => { model.streamInsert(buffer.getChannelData(0)); }); recorder.start(); try { - await model.stream(); + // CHANGE 5: Pass the toggle flag to stream + if (enableTimestamps) { + await model.stream({ enableTimestamps: true }); + } else { + await model.stream({ enableTimestamps: false }); + } } catch (error) { console.error('Error during live transcription:', error); } @@ -421,12 +763,16 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; - // CHANGE 3: Prepare the text for rendering - const displayedText = - transcription.length > 0 - ? getText(transcription) - : getText(model.committedTranscription) + - getText(model.nonCommittedTranscription); + // CHANGE 6: Logic to choose what text to display + // We use getText() on everything so it converts Arrays to Strings before concatenation + const hasResult = Array.isArray(transcription) + ? transcription.length > 0 + : transcription.length > 0; + + const displayedText = hasResult + ? getText(transcription) + : getText(model.committedTranscription) + + getText(model.nonCommittedTranscription); return ( @@ -448,6 +794,21 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { Status: {getModelStatus()} + {/* CHANGE 7: Add UI for the Toggle */} + + Enable Timestamps + { + setEnableTimestamps(val); + setTranscription(val ? [] : ''); // Reset transcription on toggle + }} + trackColor={{ false: '#767577', true: '#0f186e' }} + thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'} + disabled={model.isGenerating} // Disable changing mode while running + /> + + Transcription void }) => { }; const styles = StyleSheet.create({ + // ... existing styles ... 
container: { flex: 1, alignItems: 'center', @@ -546,6 +908,18 @@ const styles = StyleSheet.create({ marginTop: 12, alignItems: 'center', }, + // New style for the toggle + toggleContainer: { + flexDirection: 'row', + alignItems: 'center', + marginTop: 10, + marginBottom: 5, + }, + toggleLabel: { + fontSize: 16, + marginRight: 10, + color: '#0f186e', + }, transcriptionContainer: { flex: 1, width: '100%', diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index c71a58e41..f0ec05b64 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -75,6 +75,11 @@ template class ModelHostObject : public JsiHostObject { promiseHostFunction<&Model::transcribe>, "transcribe")); + addFunctions( + JSI_EXPORT_FUNCTION(ModelHostObject, + promiseHostFunction<&Model::transcribeStringOnly>, + "transcribeStringOnly")); + addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject, promiseHostFunction<&Model::stream>, "stream")); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index a97edfcb9..68e63d612 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -44,7 +44,8 @@ SpeechToText::decode(std::span tokens, } // std::vector SpeechToText::transcribe(std::span waveform, -// std::string languageOption) const { +// std::string languageOption) const +// { // std::vector segments = // this->asr->transcribe(waveform, DecodingOptions(languageOption)); // std::string transcription; @@ -85,21 +86,47 @@ std::vector SpeechToText::transcribe(std::span waveform, } } - auto wordsToJsi = [](jsi::Runtime &rt, - const std::vector &words) -> jsi::Value { - jsi::Array jsiArr(rt, words.size()); - for (size_t i = 0; i < words.size(); ++i) { - jsi::Object obj(rt); - obj.setProperty(rt, "word", - jsi::String::createFromUtf8(rt, words[i].content)); - obj.setProperty(rt, "start", static_cast(words[i].start)); - obj.setProperty(rt, "end", static_cast(words[i].end)); - jsiArr.setValueAtIndex(rt, i, obj); + return transcription; +} + +std::vector +SpeechToText::transcribeStringOnly(std::span waveform, + std::string languageOption) const { + std::vector segments = + this->asr->transcribe(waveform, DecodingOptions(languageOption)); + std::string transcription; + + size_t transcriptionLength = 0; + for (auto &segment : segments) { + for (auto &word : segment.words) { + transcriptionLength += word.content.size(); } - return jsiArr; - }; + } + transcription.reserve(transcriptionLength); - return transcription; + for (auto &segment : segments) { + for (auto &word : segment.words) { + transcription += word.content; + } + } + + return {transcription.begin(), transcription.end()}; +} + +std::vector mergeWordsToString(const std::vector &words) { + std::string result; + size_t totalLength = 0; + + for (const auto &word : words) { + totalLength += word.content.size(); + } + result.reserve(totalLength); + + for (const auto &word : words) { + result += word.content; + } + + return {result.begin(), result.end()}; } size_t SpeechToText::getMemoryLowerBound() const noexcept { @@ -153,38 +180,25 @@ size_t 
SpeechToText::getMemoryLowerBound() const noexcept { // } void SpeechToText::stream(std::shared_ptr callback, - std::string languageOption) { + std::string languageOption, bool enableTimestamps) { if (this->isStreaming) { throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress, "Streaming is already in progress!"); } - auto wordsToJsi = [](jsi::Runtime &rt, - const std::vector &words) -> jsi::Value { - jsi::Array jsiArr(rt, words.size()); - for (size_t i = 0; i < words.size(); ++i) { - jsi::Object obj(rt); - obj.setProperty(rt, "word", - jsi::String::createFromUtf8(rt, words[i].content)); - obj.setProperty(rt, "start", static_cast(words[i].start)); - obj.setProperty(rt, "end", static_cast(words[i].end)); - jsiArr.setValueAtIndex(rt, i, obj); - } - return jsiArr; - }; - - auto nativeCallback = [this, callback, - wordsToJsi](const std::vector &committedVec, - const std::vector &nonCommittedVec, - bool isDone) { - this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec, - isDone, wordsToJsi](jsi::Runtime &rt) { - jsi::Value committedJsi = wordsToJsi(rt, committedVec); - jsi::Value nonCommittedJsi = wordsToJsi(rt, nonCommittedVec); - - callback->call(rt, std::move(committedJsi), std::move(nonCommittedJsi), - jsi::Value(isDone)); - }); + auto nativeCallback = [this, callback](const auto &committedVec, + const auto &nonCommittedVec, + bool isDone) { + this->callInvoker->invokeAsync( + [callback, committedVec, nonCommittedVec, isDone](jsi::Runtime &rt) { + jsi::Value committedJsi = + rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt); + jsi::Value nonCommittedJsi = + rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt); + + callback->call(rt, std::move(committedJsi), + std::move(nonCommittedJsi), jsi::Value(isDone)); + }); }; this->isStreaming = true; @@ -197,14 +211,23 @@ void SpeechToText::stream(std::shared_ptr callback, ProcessResult res = this->processor->processIter(DecodingOptions(languageOption)); - nativeCallback(res.committed, res.nonCommitted, false); + if (enableTimestamps) { + nativeCallback(res.committed, res.nonCommitted, false); + } else { + nativeCallback(mergeWordsToString(res.committed), + mergeWordsToString(res.nonCommitted), false); + } this->readyToProcess = false; } // finish() now returns std::vector std::vector committed = this->processor->finish(); - nativeCallback(committed, {}, true); + if (enableTimestamps) { + nativeCallback(committed, std::vector{}, true); + } else { + nativeCallback(mergeWordsToString(committed), std::vector(), true); + } this->resetStreamState(); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h index af02a5357..8f6799c4e 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h @@ -27,11 +27,15 @@ class SpeechToText { [[nodiscard("Registered non-void function")]] std::vector transcribe(std::span waveform, std::string languageOption) const; + [[nodiscard("Registered non-void function")]] + std::vector transcribeStringOnly(std::span waveform, + std::string languageOption) const; + size_t getMemoryLowerBound() const noexcept; // Stream void stream(std::shared_ptr callback, - std::string languageOption); + std::string languageOption, bool enableTimestamps); void streamStop(); void streamInsert(std::span waveform); diff 
--git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 6f22bf7b5..7f42d33cb 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -136,8 +136,164 @@ // }; // }; +// import { useEffect, useCallback, useState } from 'react'; +// // Make sure Word is exported from your module file +// import { +// SpeechToTextModule, +// Word, +// } from '../../modules/natural_language_processing/SpeechToTextModule'; +// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; +// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; +// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; + +// export const useSpeechToText = ({ +// model, +// preventLoad = false, +// }: { +// model: SpeechToTextModelConfig; +// preventLoad?: boolean; +// }) => { +// const [error, setError] = useState(null); +// const [isReady, setIsReady] = useState(false); +// const [isGenerating, setIsGenerating] = useState(false); +// const [downloadProgress, setDownloadProgress] = useState(0); + +// const [modelInstance] = useState(() => new SpeechToTextModule()); + +// // FIX 1: Initialize with empty array [], generic type Word[] +// const [committedTranscription, setCommittedTranscription] = useState( +// [] +// ); +// const [nonCommittedTranscription, setNonCommittedTranscription] = useState< +// Word[] +// >([]); + +// useEffect(() => { +// if (preventLoad) return; +// (async () => { +// setDownloadProgress(0); +// setError(null); +// try { +// setIsReady(false); +// await modelInstance.load( +// { +// isMultilingual: model.isMultilingual, +// encoderSource: model.encoderSource, +// decoderSource: model.decoderSource, +// tokenizerSource: model.tokenizerSource, +// }, +// setDownloadProgress +// ); +// setIsReady(true); +// } catch (err) { +// setError(parseUnknownError(err)); +// } +// })(); +// }, [ +// modelInstance, +// model.isMultilingual, +// model.encoderSource, +// model.decoderSource, +// model.tokenizerSource, +// preventLoad, +// ]); + +// const stateWrapper = useCallback( +// Promise>(fn: T) => +// async (...args: Parameters): Promise>> => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. Please load the model before calling this function.' +// ); +// if (isGenerating) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModelGenerating, +// 'The model is currently generating. Please wait until previous model run is complete.' +// ); +// setIsGenerating(true); +// try { +// return await fn.apply(modelInstance, args); +// } finally { +// setIsGenerating(false); +// } +// }, +// [isReady, isGenerating, modelInstance] +// ); + +// const stream = useCallback( +// async (options?: DecodingOptions) => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. Please load the model before calling this function.' +// ); +// if (isGenerating) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModelGenerating, +// 'The model is currently generating. Please wait until previous model run is complete.' 
+// ); +// setIsGenerating(true); + +// // FIX 2: Reset to empty arrays +// setCommittedTranscription([]); +// setNonCommittedTranscription([]); + +// // Accumulator is now an array of Words, not a string +// const fullResult: Word[] = []; + +// try { +// for await (const { committed, nonCommitted } of modelInstance.stream( +// options +// )) { +// // FIX 3: Update state by appending arrays +// if (committed.length > 0) { +// setCommittedTranscription((prev) => [...prev, ...committed]); +// fullResult.push(...committed); +// } + +// // nonCommitted is always a fresh partial chunk +// setNonCommittedTranscription(nonCommitted); +// } +// } finally { +// setIsGenerating(false); +// } +// return fullResult; +// }, +// [isReady, isGenerating, modelInstance] +// ); + +// const wrapper = useCallback( +// any>(fn: T) => { +// return (...args: Parameters): ReturnType => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. Please load the model before calling this function.' +// ); +// return fn.apply(modelInstance, args); +// }; +// }, +// [isReady, modelInstance] +// ); + +// return { +// error, +// isReady, +// isGenerating, +// downloadProgress, +// committedTranscription, +// nonCommittedTranscription, +// encode: stateWrapper(SpeechToTextModule.prototype.encode), +// decode: stateWrapper(SpeechToTextModule.prototype.decode), +// transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), +// stream, +// streamStop: wrapper(SpeechToTextModule.prototype.streamStop), +// streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), +// }; +// }; + import { useEffect, useCallback, useState } from 'react'; -// Make sure Word is exported from your module file import { SpeechToTextModule, Word, @@ -160,13 +316,13 @@ export const useSpeechToText = ({ const [modelInstance] = useState(() => new SpeechToTextModule()); - // FIX 1: Initialize with empty array [], generic type Word[] - const [committedTranscription, setCommittedTranscription] = useState( - [] - ); + // FIX 1: Allow state to be either string or Word[] + const [committedTranscription, setCommittedTranscription] = useState< + string | Word[] + >(''); const [nonCommittedTranscription, setNonCommittedTranscription] = useState< - Word[] - >([]); + string | Word[] + >(''); useEffect(() => { if (preventLoad) return; @@ -189,14 +345,7 @@ export const useSpeechToText = ({ setError(parseUnknownError(err)); } })(); - }, [ - modelInstance, - model.isMultilingual, - model.encoderSource, - model.decoderSource, - model.tokenizerSource, - preventLoad, - ]); + }, [modelInstance, model, preventLoad]); const stateWrapper = useCallback( Promise>(fn: T) => @@ -204,12 +353,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' + 'The model is currently not loaded.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' + 'The model is currently generating.' ); setIsGenerating(true); try { @@ -222,38 +371,66 @@ export const useSpeechToText = ({ ); const stream = useCallback( - async (options?: DecodingOptions) => { + async (options?: DecodingOptions & { enableTimestamps?: boolean }) => { + console.log( + '[2] Hook: Stream called. 
Ready:', + isReady, + 'Generating:', + isGenerating + ); if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' + 'Model not loaded' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' + 'Model is generating' ); + setIsGenerating(true); - // FIX 2: Reset to empty arrays - setCommittedTranscription([]); - setNonCommittedTranscription([]); + // FIX 2: Reset based on the mode requested + const enableTimestamps = options?.enableTimestamps ?? false; + setCommittedTranscription(enableTimestamps ? [] : ''); + setNonCommittedTranscription(enableTimestamps ? [] : ''); - // Accumulator is now an array of Words, not a string - const fullResult: Word[] = []; + let fullResult: string | Word[] = enableTimestamps ? [] : ''; try { + console.log('[3] Hook: Calling modelInstance.stream()'); + // @ts-ignore - Typescript struggles with the dual generator return type, but logic is safe for await (const { committed, nonCommitted } of modelInstance.stream( options )) { - // FIX 3: Update state by appending arrays - if (committed.length > 0) { - setCommittedTranscription((prev) => [...prev, ...committed]); - fullResult.push(...committed); - } + console.log(committed, nonCommitted); + // FIX 3: Dynamic Merging Logic + if (typeof committed === 'string') { + // --- STRING MODE --- + if (committed.length > 0) { + setCommittedTranscription((prev) => { + // Safety check: if prev was somehow an array, reset it or cast to string + const prevStr = typeof prev === 'string' ? prev : ''; + return prevStr + committed; + }); + (fullResult as string) += committed; + } + setNonCommittedTranscription(nonCommitted as string); + } else { + // --- WORD[] MODE --- + const committedWords = committed as Word[]; + const nonCommittedWords = nonCommitted as Word[]; - // nonCommitted is always a fresh partial chunk - setNonCommittedTranscription(nonCommitted); + if (committedWords.length > 0) { + setCommittedTranscription((prev) => { + const prevArr = Array.isArray(prev) ? prev : []; + return [...prevArr, ...committedWords]; + }); + (fullResult as Word[]).push(...committedWords); + } + setNonCommittedTranscription(nonCommittedWords); + } } } finally { setIsGenerating(false); @@ -269,7 +446,7 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' 
+ 'Model not loaded' ); return fn.apply(modelInstance, args); }; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index e0ca88251..5d5fd3248 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -1,194 +1,3 @@ -// import { Logger } from '../../common/Logger'; -// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; -// import { ResourceFetcher } from '../../utils/ResourceFetcher'; -// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; -// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; - -// export class SpeechToTextModule { -// private nativeModule: any; - -// private modelConfig!: SpeechToTextModelConfig; - -// private textDecoder = new TextDecoder('utf-8', { -// fatal: false, -// ignoreBOM: true, -// }); - -// public async load( -// model: SpeechToTextModelConfig, -// onDownloadProgressCallback: (progress: number) => void = () => {} -// ) { -// this.modelConfig = model; - -// const tokenizerLoadPromise = ResourceFetcher.fetch( -// undefined, -// model.tokenizerSource -// ); -// const encoderDecoderPromise = ResourceFetcher.fetch( -// onDownloadProgressCallback, -// model.encoderSource, -// model.decoderSource -// ); -// const [tokenizerSources, encoderDecoderResults] = await Promise.all([ -// tokenizerLoadPromise, -// encoderDecoderPromise, -// ]); -// const encoderSource = encoderDecoderResults?.[0]; -// const decoderSource = encoderDecoderResults?.[1]; -// if (!encoderSource || !decoderSource || !tokenizerSources) { -// throw new RnExecutorchError( -// RnExecutorchErrorCode.DownloadInterrupted, -// 'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.' -// ); -// } -// this.nativeModule = await global.loadSpeechToText( -// encoderSource, -// decoderSource, -// tokenizerSources[0]! 
-// ); -// } - -// public delete(): void { -// this.nativeModule.unload(); -// } - -// public async encode( -// waveform: Float32Array | number[] -// ): Promise { -// if (Array.isArray(waveform)) { -// Logger.info( -// 'Passing waveform as number[] is deprecated, use Float32Array instead' -// ); -// waveform = new Float32Array(waveform); -// } -// return new Float32Array(await this.nativeModule.encode(waveform)); -// } - -// public async decode( -// tokens: Int32Array | number[], -// encoderOutput: Float32Array | number[] -// ): Promise { -// if (Array.isArray(tokens)) { -// Logger.info( -// 'Passing tokens as number[] is deprecated, use Int32Array instead' -// ); -// tokens = new Int32Array(tokens); -// } -// if (Array.isArray(encoderOutput)) { -// Logger.info( -// 'Passing encoderOutput as number[] is deprecated, use Float32Array instead' -// ); -// encoderOutput = new Float32Array(encoderOutput); -// } -// return new Float32Array( -// await this.nativeModule.decode(tokens, encoderOutput) -// ); -// } - -// public async transcribe( -// waveform: Float32Array | number[], -// options: DecodingOptions = {} -// ): Promise { -// this.validateOptions(options); - -// if (Array.isArray(waveform)) { -// Logger.info( -// 'Passing waveform as number[] is deprecated, use Float32Array instead' -// ); -// waveform = new Float32Array(waveform); -// } -// const transcriptionBytes = await this.nativeModule.transcribe( -// waveform, -// options.language || '' -// ); -// return this.textDecoder.decode(new Uint8Array(transcriptionBytes)); -// } - -// public async *stream( -// options: DecodingOptions = {} -// ): AsyncGenerator<{ committed: string; nonCommitted: string }> { -// this.validateOptions(options); - -// const queue: { committed: string; nonCommitted: string }[] = []; -// let waiter: (() => void) | null = null; -// let finished = false; -// let error: unknown; - -// const wake = () => { -// waiter?.(); -// waiter = null; -// }; - -// (async () => { -// try { -// await this.nativeModule.stream( -// (committed: number[], nonCommitted: number[], isDone: boolean) => { -// queue.push({ -// committed: this.textDecoder.decode(new Uint8Array(committed)), -// nonCommitted: this.textDecoder.decode( -// new Uint8Array(nonCommitted) -// ), -// }); -// if (isDone) { -// finished = true; -// } -// wake(); -// }, -// options.language || '' -// ); -// finished = true; -// wake(); -// } catch (e) { -// error = e; -// finished = true; -// wake(); -// } -// })(); - -// while (true) { -// if (queue.length > 0) { -// yield queue.shift()!; -// if (finished && queue.length === 0) { -// return; -// } -// continue; -// } -// if (error) throw parseUnknownError(error); -// if (finished) return; -// await new Promise((r) => (waiter = r)); -// } -// } - -// public streamInsert(waveform: Float32Array | number[]): void { -// if (Array.isArray(waveform)) { -// Logger.info( -// 'Passing waveform as number[] is deprecated, use Float32Array instead' -// ); -// waveform = new Float32Array(waveform); -// } -// this.nativeModule.streamInsert(waveform); -// } - -// public streamStop(): void { -// this.nativeModule.streamStop(); -// } - -// private validateOptions(options: DecodingOptions) { -// if (!this.modelConfig.isMultilingual && options.language) { -// throw new RnExecutorchError( -// RnExecutorchErrorCode.InvalidConfig, -// 'Model is not multilingual, cannot set language' -// ); -// } -// if (this.modelConfig.isMultilingual && !options.language) { -// throw new RnExecutorchError( -// RnExecutorchErrorCode.InvalidConfig, -// 'Model 
is multilingual, provide a language' -// ); -// } -// } -// } - import { Logger } from '../../common/Logger'; import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; import { ResourceFetcher } from '../../utils/ResourceFetcher'; @@ -206,7 +15,10 @@ export class SpeechToTextModule { private nativeModule: any; private modelConfig!: SpeechToTextModelConfig; - // 2. TextDecoder is removed as C++ now returns JS objects directly + private textDecoder = new TextDecoder('utf-8', { + fatal: false, + ignoreBOM: true, + }); public async load( model: SpeechToTextModelConfig, @@ -279,11 +91,20 @@ export class SpeechToTextModule { ); } - // 3. Update transcribe to return Word[] instead of string + public async transcribe( + waveform: Float32Array | number[], + options?: DecodingOptions & { enableTimestamps: true } + ): Promise; + + public async transcribe( + waveform: Float32Array | number[], + options?: DecodingOptions & { enableTimestamps?: false | undefined } + ): Promise; + public async transcribe( waveform: Float32Array | number[], options: DecodingOptions = {} - ): Promise { + ): Promise { this.validateOptions(options); if (Array.isArray(waveform)) { @@ -293,22 +114,45 @@ export class SpeechToTextModule { waveform = new Float32Array(waveform); } - // The native module now returns an Array of Objects, not bytes - const transcription: Word[] = await this.nativeModule.transcribe( - waveform, - options.language || '' - ); + const language = options.language || ''; + + if (options.enableTimestamps) { + return await this.nativeModule.transcribe(waveform, language); + } else { + const transcriptionBytes = await this.nativeModule.transcribeStringOnly( + waveform, + language + ); - return transcription; + return this.textDecoder.decode(new Uint8Array(transcriptionBytes)); + } } - // 4. 
Update stream to yield Word[] structure + public stream( + options: DecodingOptions & { enableTimestamps: true } + ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }>; + + public stream( + options?: DecodingOptions & { enableTimestamps?: false | undefined } + ): AsyncGenerator<{ committed: string; nonCommitted: string }>; + public async *stream( options: DecodingOptions = {} - ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }> { + ): AsyncGenerator<{ + committed: string | Word[]; + nonCommitted: string | Word[]; + }> { + console.log('[4] Module: Entered stream method'); this.validateOptions(options); - const queue: { committed: Word[]; nonCommitted: Word[] }[] = []; + // Ensure we strictly default to false + const enableTimestamps = options.enableTimestamps === true; + + const queue: { + committed: string | Word[]; + nonCommitted: string | Word[]; + }[] = []; + let waiter: (() => void) | null = null; let finished = false; let error: unknown; @@ -320,20 +164,34 @@ export class SpeechToTextModule { (async () => { try { - await this.nativeModule.stream( - // Callback now receives arrays of objects directly - (committed: Word[], nonCommitted: Word[], isDone: boolean) => { - queue.push({ - committed, - nonCommitted, - }); - if (isDone) { - finished = true; + const callback = ( + committed: any, + nonCommitted: any, + isDone: boolean + ) => { + if (!enableTimestamps) { + try { + queue.push({ + committed: this.textDecoder.decode(new Uint8Array(committed)), + nonCommitted: this.textDecoder.decode( + new Uint8Array(nonCommitted) + ), + }); + } catch (err) { + console.error('[Stream Decode Error]', err); } - wake(); - }, - options.language || '' - ); + } else { + queue.push({ committed, nonCommitted }); + } + + if (isDone) finished = true; + wake(); + }; + + const language = options.language || ''; + + await this.nativeModule.stream(callback, language, enableTimestamps); + finished = true; wake(); } catch (e) { diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts index 20627ca11..8f95eb16d 100644 --- a/packages/react-native-executorch/src/types/stt.ts +++ b/packages/react-native-executorch/src/types/stt.ts @@ -80,6 +80,7 @@ export type SpeechToTextLanguage = export interface DecodingOptions { language?: SpeechToTextLanguage; + enableTimestamps?: boolean; } export interface SpeechToTextModelConfig { From 9846acbe1f6099355b74a5874cc4985fac8afeb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 15:54:28 +0100 Subject: [PATCH 05/14] Clear files --- apps/speech/screens/SpeechToTextScreen.tsx | 640 +----------------- .../host_objects/JsiConversions.h | 4 - .../models/speech_to_text/SpeechToText.cpp | 70 -- .../useSpeechToText.ts | 295 -------- .../SpeechToTextModule.ts | 3 - 5 files changed, 5 insertions(+), 1007 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 9dab4420b..f844241f3 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -1,617 +1,3 @@ -// import React, { useEffect, useRef, useState } from 'react'; -// import { -// Text, -// View, -// StyleSheet, -// TouchableOpacity, -// ScrollView, -// TextInput, -// KeyboardAvoidingView, -// Platform, -// } from 'react-native'; -// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; -// import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; -// import 
FontAwesome from '@expo/vector-icons/FontAwesome'; -// import { -// AudioManager, -// AudioRecorder, -// AudioContext, -// } from 'react-native-audio-api'; -// import * as FileSystem from 'expo-file-system/legacy'; -// import SWMIcon from '../assets/swm_icon.svg'; -// import DeviceInfo from 'react-native-device-info'; - -// const isSimulator = DeviceInfo.isEmulatorSync(); - -// export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { -// const model = useSpeechToText({ -// model: WHISPER_TINY_EN, -// }); - -// const [transcription, setTranscription] = useState(''); -// const [audioURL, setAudioURL] = useState(''); -// const [liveTranscribing, setLiveTranscribing] = useState(false); -// const scrollViewRef = useRef(null); - -// const [recorder] = useState( -// () => -// new AudioRecorder({ -// sampleRate: 16000, -// bufferLengthInSamples: 1600, -// }) -// ); - -// useEffect(() => { -// AudioManager.setAudioSessionOptions({ -// iosCategory: 'playAndRecord', -// iosMode: 'spokenAudio', -// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], -// }); -// AudioManager.requestRecordingPermissions(); -// }, []); - -// const handleTranscribeFromURL = async () => { -// if (!audioURL.trim()) { -// console.warn('Please provide a valid audio file URL'); -// return; -// } - -// const { uri } = await FileSystem.downloadAsync( -// audioURL, -// FileSystem.cacheDirectory + 'audio_file' -// ); - -// const audioContext = new AudioContext({ sampleRate: 16000 }); - -// try { -// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); -// const audioBuffer = decodedAudioData.getChannelData(0); -// setTranscription(await model.transcribe(audioBuffer)); -// } catch (error) { -// console.error('Error decoding audio data', error); -// console.warn('Note: Supported file formats: mp3, wav, flac'); -// return; -// } -// }; - -// const handleStartTranscribeFromMicrophone = async () => { -// setLiveTranscribing(true); -// setTranscription(''); -// recorder.onAudioReady(({ buffer }) => { -// model.streamInsert(buffer.getChannelData(0)); -// }); -// recorder.start(); - -// try { -// await model.stream(); -// } catch (error) { -// console.error('Error during live transcription:', error); -// } -// }; - -// const handleStopTranscribeFromMicrophone = () => { -// recorder.stop(); -// model.streamStop(); -// console.log('Live transcription stopped'); -// setLiveTranscribing(false); -// }; - -// const getModelStatus = () => { -// if (model.error) return `${model.error}`; -// if (model.isGenerating) return 'Transcribing...'; -// if (model.isReady) return 'Ready to transcribe'; -// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; -// }; - -// const readyToTranscribe = !model.isGenerating && model.isReady; -// const recordingButtonDisabled = isSimulator || !readyToTranscribe; - -// return ( -// -// -// -// -// -// -// -// -// React Native ExecuTorch -// Speech to Text -// - -// -// Status: {getModelStatus()} -// - -// -// Transcription -// -// scrollViewRef.current?.scrollToEnd({ animated: true }) -// } -// > -// -// {transcription !== '' -// ? transcription -// : model.committedTranscription + -// model.nonCommittedTranscription} -// -// -// - -// -// -// -// -// Start -// -// - -// {liveTranscribing ? ( -// -// -// Stop Live Transcription -// -// ) : ( -// -// -// -// {isSimulator -// ? 
'Recording is not available on Simulator' -// : 'Start Live Transcription'} -// -// -// )} -// -// -// -// -// ); -// }; - -// const styles = StyleSheet.create({ -// container: { -// flex: 1, -// alignItems: 'center', -// backgroundColor: 'white', -// paddingHorizontal: 16, -// }, -// keyboardAvoidingView: { -// flex: 1, -// width: '100%', -// }, -// header: { -// alignItems: 'center', -// position: 'relative', -// width: '100%', -// }, -// backButton: { -// position: 'absolute', -// left: 0, -// top: 10, -// padding: 10, -// zIndex: 1, -// }, -// headerText: { -// fontSize: 22, -// fontWeight: 'bold', -// color: '#0f186e', -// }, -// statusContainer: { -// marginTop: 12, -// alignItems: 'center', -// }, -// transcriptionContainer: { -// flex: 1, -// width: '100%', -// marginVertical: 12, -// }, -// transcriptionLabel: { -// marginLeft: 12, -// marginBottom: 4, -// color: '#0f186e', -// }, -// transcriptionScrollContainer: { -// borderRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// padding: 12, -// }, -// inputContainer: { -// marginBottom: 12, -// }, -// urlTranscriptionContainer: { -// width: '100%', -// flexDirection: 'row', -// }, -// urlTranscriptionInput: { -// flex: 1, -// padding: 12, -// borderTopLeftRadius: 12, -// borderBottomLeftRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// borderRightWidth: 0, -// }, -// urlTranscriptionButton: { -// backgroundColor: '#0f186e', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderTopRightRadius: 12, -// borderBottomRightRadius: 12, -// }, -// buttonText: { -// color: 'white', -// fontWeight: '600', -// letterSpacing: -0.5, -// fontSize: 16, -// }, -// liveTranscriptionButton: { -// flexDirection: 'row', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderRadius: 12, -// marginTop: 12, -// gap: 8, -// }, -// backgroundRed: { -// backgroundColor: 'red', -// }, -// backgroundBlue: { -// backgroundColor: '#0f186e', -// }, -// disabled: { -// opacity: 0.5, -// }, -// }); - -// import React, { useEffect, useRef, useState } from 'react'; -// import { -// Text, -// View, -// StyleSheet, -// TouchableOpacity, -// ScrollView, -// TextInput, -// KeyboardAvoidingView, -// Platform, -// } from 'react-native'; -// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; -// import { -// useSpeechToText, -// WHISPER_TINY_EN, -// // Make sure Word is exported from your module -// Word, -// } from 'react-native-executorch'; -// import FontAwesome from '@expo/vector-icons/FontAwesome'; -// import { -// AudioManager, -// AudioRecorder, -// AudioContext, -// } from 'react-native-audio-api'; -// import * as FileSystem from 'expo-file-system/legacy'; -// import SWMIcon from '../assets/swm_icon.svg'; -// import DeviceInfo from 'react-native-device-info'; - -// const isSimulator = DeviceInfo.isEmulatorSync(); - -// export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { -// const model = useSpeechToText({ -// model: WHISPER_TINY_EN, -// }); - -// // CHANGE 1: Update state to hold Word[] instead of string -// const [transcription, setTranscription] = useState([]); - -// const [audioURL, setAudioURL] = useState(''); -// const [liveTranscribing, setLiveTranscribing] = useState(false); -// const scrollViewRef = useRef(null); - -// const [recorder] = useState( -// () => -// new AudioRecorder({ -// sampleRate: 16000, -// bufferLengthInSamples: 1600, -// }) -// ); - -// useEffect(() => { -// AudioManager.setAudioSessionOptions({ -// 
iosCategory: 'playAndRecord', -// iosMode: 'spokenAudio', -// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], -// }); -// AudioManager.requestRecordingPermissions(); -// }, []); - -// const getText = (words: Word[]) => { -// return words -// .map((w) => { -// // Format: "hello (0.00s - 0.50s) " -// // using toFixed(2) for cleaner timestamp display -// return `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`; -// }) -// .join(''); -// }; - -// const handleTranscribeFromURL = async () => { -// if (!audioURL.trim()) { -// console.warn('Please provide a valid audio file URL'); -// return; -// } - -// const { uri } = await FileSystem.downloadAsync( -// audioURL, -// FileSystem.cacheDirectory + 'audio_file' -// ); - -// const audioContext = new AudioContext({ sampleRate: 16000 }); - -// try { -// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); -// const audioBuffer = decodedAudioData.getChannelData(0); -// // model.transcribe now returns Word[], which matches our state type -// setTranscription(await model.transcribe(audioBuffer)); -// } catch (error) { -// console.error('Error decoding audio data', error); -// console.warn('Note: Supported file formats: mp3, wav, flac'); -// return; -// } -// }; - -// const handleStartTranscribeFromMicrophone = async () => { -// setLiveTranscribing(true); -// setTranscription([]); // Reset to empty array -// recorder.onAudioReady(({ buffer }) => { -// model.streamInsert(buffer.getChannelData(0)); -// }); -// recorder.start(); - -// try { -// await model.stream(); -// } catch (error) { -// console.error('Error during live transcription:', error); -// } -// }; - -// const handleStopTranscribeFromMicrophone = () => { -// recorder.stop(); -// model.streamStop(); -// console.log('Live transcription stopped'); -// setLiveTranscribing(false); -// }; - -// const getModelStatus = () => { -// if (model.error) return `${model.error}`; -// if (model.isGenerating) return 'Transcribing...'; -// if (model.isReady) return 'Ready to transcribe'; -// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; -// }; - -// const readyToTranscribe = !model.isGenerating && model.isReady; -// const recordingButtonDisabled = isSimulator || !readyToTranscribe; - -// // CHANGE 3: Prepare the text for rendering -// const displayedText = -// transcription.length > 0 -// ? getText(transcription) -// : getText(model.committedTranscription) + -// getText(model.nonCommittedTranscription); - -// return ( -// -// -// -// -// -// -// -// -// React Native ExecuTorch -// Speech to Text -// - -// -// Status: {getModelStatus()} -// - -// -// Transcription -// -// scrollViewRef.current?.scrollToEnd({ animated: true }) -// } -// > -// {displayedText} -// -// - -// -// -// -// -// Start -// -// - -// {liveTranscribing ? ( -// -// -// Stop Live Transcription -// -// ) : ( -// -// -// -// {isSimulator -// ? 
'Recording is not available on Simulator' -// : 'Start Live Transcription'} -// -// -// )} -// -// -// -// -// ); -// }; - -// const styles = StyleSheet.create({ -// container: { -// flex: 1, -// alignItems: 'center', -// backgroundColor: 'white', -// paddingHorizontal: 16, -// }, -// keyboardAvoidingView: { -// flex: 1, -// width: '100%', -// }, -// header: { -// alignItems: 'center', -// position: 'relative', -// width: '100%', -// }, -// backButton: { -// position: 'absolute', -// left: 0, -// top: 10, -// padding: 10, -// zIndex: 1, -// }, -// headerText: { -// fontSize: 22, -// fontWeight: 'bold', -// color: '#0f186e', -// }, -// statusContainer: { -// marginTop: 12, -// alignItems: 'center', -// }, -// transcriptionContainer: { -// flex: 1, -// width: '100%', -// marginVertical: 12, -// }, -// transcriptionLabel: { -// marginLeft: 12, -// marginBottom: 4, -// color: '#0f186e', -// }, -// transcriptionScrollContainer: { -// borderRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// padding: 12, -// }, -// inputContainer: { -// marginBottom: 12, -// }, -// urlTranscriptionContainer: { -// width: '100%', -// flexDirection: 'row', -// }, -// urlTranscriptionInput: { -// flex: 1, -// padding: 12, -// borderTopLeftRadius: 12, -// borderBottomLeftRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// borderRightWidth: 0, -// }, -// urlTranscriptionButton: { -// backgroundColor: '#0f186e', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderTopRightRadius: 12, -// borderBottomRightRadius: 12, -// }, -// buttonText: { -// color: 'white', -// fontWeight: '600', -// letterSpacing: -0.5, -// fontSize: 16, -// }, -// liveTranscriptionButton: { -// flexDirection: 'row', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderRadius: 12, -// marginTop: 12, -// gap: 8, -// }, -// backgroundRed: { -// backgroundColor: 'red', -// }, -// backgroundBlue: { -// backgroundColor: '#0f186e', -// }, -// disabled: { -// opacity: 0.5, -// }, -// }); - import React, { useEffect, useRef, useState } from 'react'; import { Text, @@ -622,7 +8,7 @@ import { TextInput, KeyboardAvoidingView, Platform, - Switch, // Import Switch + Switch, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { @@ -647,10 +33,8 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - // CHANGE 1: State can now be string OR Word[] const [transcription, setTranscription] = useState(''); - // CHANGE 2: Add toggle for timestamps const [enableTimestamps, setEnableTimestamps] = useState(false); const [audioURL, setAudioURL] = useState(''); @@ -674,20 +58,16 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { AudioManager.requestRecordingPermissions(); }, []); - // CHANGE 3: Smart helper that handles both formats const getText = (data: string | Word[] | undefined) => { - console.log('UI Received:', JSON.stringify(data)); if (!data) return ''; if (typeof data === 'string') return data; - // It's Word[], format with timestamps return data .map((w) => `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`) .join(''); }; const handleTranscribeFromURL = async () => { - console.log('[1] UI: Button Pressed. 
Calling model.stream()...'); if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); return; @@ -708,12 +88,12 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: true, + enableTimestamps: true }); setTranscription(result); } else { const result = await model.transcribe(audioBuffer, { - enableTimestamps: false, + enableTimestamps: false }); setTranscription(result); } @@ -726,7 +106,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const handleStartTranscribeFromMicrophone = async () => { setLiveTranscribing(true); - // Reset based on mode setTranscription(enableTimestamps ? [] : ''); recorder.onAudioReady(({ buffer }) => { @@ -735,12 +114,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { recorder.start(); try { - // CHANGE 5: Pass the toggle flag to stream - if (enableTimestamps) { - await model.stream({ enableTimestamps: true }); - } else { - await model.stream({ enableTimestamps: false }); - } + await model.stream({ enableTimestamps: enableTimestamps }); } catch (error) { console.error('Error during live transcription:', error); } @@ -763,11 +137,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; - // CHANGE 6: Logic to choose what text to display - // We use getText() on everything so it converts Arrays to Strings before concatenation - const hasResult = Array.isArray(transcription) - ? transcription.length > 0 - : transcription.length > 0; + const hasResult = transcription.length > 0; const displayedText = hasResult ? getText(transcription) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index df7f635d9..95da364ff 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -69,15 +69,11 @@ template <> inline Word getValue(const jsi::Value &val, jsi::Runtime &runtime) { jsi::Object obj = val.asObject(runtime); - // 1. Extract the string "word" using the existing string helper std::string content = getValue(obj.getProperty(runtime, "word"), runtime); - // 2. Extract start/end times - // We use .asNumber() directly as these are primitives double start = obj.getProperty(runtime, "start").asNumber(); double end = obj.getProperty(runtime, "end").asNumber(); - // 3. 
Construct and return the C++ Word struct return Word{ .content = std::move(content), .start = static_cast(start), diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index 68e63d612..aa9980bd8 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -43,30 +43,6 @@ SpeechToText::decode(std::span tokens, return std::make_shared(decoderOutput); } -// std::vector SpeechToText::transcribe(std::span waveform, -// std::string languageOption) const -// { -// std::vector segments = -// this->asr->transcribe(waveform, DecodingOptions(languageOption)); -// std::string transcription; - -// size_t transcriptionLength = 0; -// for (auto &segment : segments) { -// for (auto &word : segment.words) { -// transcriptionLength += word.content.size(); -// } -// } -// transcription.reserve(transcriptionLength); - -// for (auto &segment : segments) { -// for (auto &word : segment.words) { -// transcription += word.content; -// } -// } - -// return {transcription.begin(), transcription.end()}; -// } - std::vector SpeechToText::transcribe(std::span waveform, std::string languageOption) const { std::vector segments = @@ -134,51 +110,6 @@ size_t SpeechToText::getMemoryLowerBound() const noexcept { this->decoder->getMemoryLowerBound(); } -// void SpeechToText::stream(std::shared_ptr callback, -// std::string languageOption) { -// if (this->isStreaming) { -// throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress, -// "Streaming is already in progress!"); -// } - -// auto nativeCallback = -// [this, callback](const std::vector &committedVec, -// const std::vector &nonCommittedVec, bool isDone) -// { -// this->callInvoker->invokeAsync([callback, committedVec, -// nonCommittedVec, -// isDone](jsi::Runtime &rt) { -// callback->call( -// rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, -// rt), rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, -// rt), jsi::Value(isDone)); -// }); -// }; - -// this->isStreaming = true; -// while (this->isStreaming) { -// if (!this->readyToProcess || -// this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) -// { -// std::this_thread::sleep_for(std::chrono::milliseconds(100)); -// continue; -// } -// ProcessResult res = -// this->processor->processIter(DecodingOptions(languageOption)); - -// nativeCallback({res.committed.begin(), res.committed.end()}, -// {res.nonCommitted.begin(), res.nonCommitted.end()}, -// false); -// this->readyToProcess = false; -// } - -// std::string committed = this->processor->finish(); - -// nativeCallback({committed.begin(), committed.end()}, {}, true); - -// this->resetStreamState(); -// } - void SpeechToText::stream(std::shared_ptr callback, std::string languageOption, bool enableTimestamps) { if (this->isStreaming) { @@ -220,7 +151,6 @@ void SpeechToText::stream(std::shared_ptr callback, this->readyToProcess = false; } - // finish() now returns std::vector std::vector committed = this->processor->finish(); if (enableTimestamps) { diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 7f42d33cb..40801c7b3 100644 --- 
a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -1,298 +1,3 @@ -// import { useEffect, useCallback, useState } from 'react'; -// import { SpeechToTextModule, Word } from '../../modules/natural_language_processing/SpeechToTextModule'; -// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; -// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; -// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; - -// export const useSpeechToText = ({ -// model, -// preventLoad = false, -// }: { -// model: SpeechToTextModelConfig; -// preventLoad?: boolean; -// }) => { -// const [error, setError] = useState(null); -// const [isReady, setIsReady] = useState(false); -// const [isGenerating, setIsGenerating] = useState(false); -// const [downloadProgress, setDownloadProgress] = useState(0); - -// const [modelInstance] = useState(() => new SpeechToTextModule()); -// const [committedTranscription, setCommittedTranscription] = useState(Word); -// const [nonCommittedTranscription, setNonCommittedTranscription] = -// useState(Word); - -// useEffect(() => { -// if (preventLoad) return; -// (async () => { -// setDownloadProgress(0); -// setError(null); -// try { -// setIsReady(false); -// await modelInstance.load( -// { -// isMultilingual: model.isMultilingual, -// encoderSource: model.encoderSource, -// decoderSource: model.decoderSource, -// tokenizerSource: model.tokenizerSource, -// }, -// setDownloadProgress -// ); -// setIsReady(true); -// } catch (err) { -// setError(parseUnknownError(err)); -// } -// })(); -// }, [ -// modelInstance, -// model.isMultilingual, -// model.encoderSource, -// model.decoderSource, -// model.tokenizerSource, -// preventLoad, -// ]); - -// const stateWrapper = useCallback( -// Promise>(fn: T) => -// async (...args: Parameters): Promise>> => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// if (isGenerating) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModelGenerating, -// 'The model is currently generating. Please wait until previous model run is complete.' -// ); -// setIsGenerating(true); -// try { -// return await fn.apply(modelInstance, args); -// } finally { -// setIsGenerating(false); -// } -// }, -// [isReady, isGenerating, modelInstance] -// ); - -// const stream = useCallback( -// async (options?: DecodingOptions) => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// if (isGenerating) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModelGenerating, -// 'The model is currently generating. Please wait until previous model run is complete.' 
-// ); -// setIsGenerating(true); -// setCommittedTranscription(''); -// setNonCommittedTranscription(''); -// let transcription = ''; -// try { -// for await (const { committed, nonCommitted } of modelInstance.stream( -// options -// )) { -// setCommittedTranscription((prev) => prev + committed); -// setNonCommittedTranscription(nonCommitted); -// transcription += committed; -// } -// } finally { -// setIsGenerating(false); -// } -// return transcription; -// }, -// [isReady, isGenerating, modelInstance] -// ); - -// const wrapper = useCallback( -// any>(fn: T) => { -// return (...args: Parameters): ReturnType => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// return fn.apply(modelInstance, args); -// }; -// }, -// [isReady, modelInstance] -// ); - -// return { -// error, -// isReady, -// isGenerating, -// downloadProgress, -// committedTranscription, -// nonCommittedTranscription, -// encode: stateWrapper(SpeechToTextModule.prototype.encode), -// decode: stateWrapper(SpeechToTextModule.prototype.decode), -// transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), -// stream, -// streamStop: wrapper(SpeechToTextModule.prototype.streamStop), -// streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), -// }; -// }; - -// import { useEffect, useCallback, useState } from 'react'; -// // Make sure Word is exported from your module file -// import { -// SpeechToTextModule, -// Word, -// } from '../../modules/natural_language_processing/SpeechToTextModule'; -// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; -// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; -// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; - -// export const useSpeechToText = ({ -// model, -// preventLoad = false, -// }: { -// model: SpeechToTextModelConfig; -// preventLoad?: boolean; -// }) => { -// const [error, setError] = useState(null); -// const [isReady, setIsReady] = useState(false); -// const [isGenerating, setIsGenerating] = useState(false); -// const [downloadProgress, setDownloadProgress] = useState(0); - -// const [modelInstance] = useState(() => new SpeechToTextModule()); - -// // FIX 1: Initialize with empty array [], generic type Word[] -// const [committedTranscription, setCommittedTranscription] = useState( -// [] -// ); -// const [nonCommittedTranscription, setNonCommittedTranscription] = useState< -// Word[] -// >([]); - -// useEffect(() => { -// if (preventLoad) return; -// (async () => { -// setDownloadProgress(0); -// setError(null); -// try { -// setIsReady(false); -// await modelInstance.load( -// { -// isMultilingual: model.isMultilingual, -// encoderSource: model.encoderSource, -// decoderSource: model.decoderSource, -// tokenizerSource: model.tokenizerSource, -// }, -// setDownloadProgress -// ); -// setIsReady(true); -// } catch (err) { -// setError(parseUnknownError(err)); -// } -// })(); -// }, [ -// modelInstance, -// model.isMultilingual, -// model.encoderSource, -// model.decoderSource, -// model.tokenizerSource, -// preventLoad, -// ]); - -// const stateWrapper = useCallback( -// Promise>(fn: T) => -// async (...args: Parameters): Promise>> => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' 
-// ); -// if (isGenerating) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModelGenerating, -// 'The model is currently generating. Please wait until previous model run is complete.' -// ); -// setIsGenerating(true); -// try { -// return await fn.apply(modelInstance, args); -// } finally { -// setIsGenerating(false); -// } -// }, -// [isReady, isGenerating, modelInstance] -// ); - -// const stream = useCallback( -// async (options?: DecodingOptions) => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// if (isGenerating) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModelGenerating, -// 'The model is currently generating. Please wait until previous model run is complete.' -// ); -// setIsGenerating(true); - -// // FIX 2: Reset to empty arrays -// setCommittedTranscription([]); -// setNonCommittedTranscription([]); - -// // Accumulator is now an array of Words, not a string -// const fullResult: Word[] = []; - -// try { -// for await (const { committed, nonCommitted } of modelInstance.stream( -// options -// )) { -// // FIX 3: Update state by appending arrays -// if (committed.length > 0) { -// setCommittedTranscription((prev) => [...prev, ...committed]); -// fullResult.push(...committed); -// } - -// // nonCommitted is always a fresh partial chunk -// setNonCommittedTranscription(nonCommitted); -// } -// } finally { -// setIsGenerating(false); -// } -// return fullResult; -// }, -// [isReady, isGenerating, modelInstance] -// ); - -// const wrapper = useCallback( -// any>(fn: T) => { -// return (...args: Parameters): ReturnType => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// return fn.apply(modelInstance, args); -// }; -// }, -// [isReady, modelInstance] -// ); - -// return { -// error, -// isReady, -// isGenerating, -// downloadProgress, -// committedTranscription, -// nonCommittedTranscription, -// encode: stateWrapper(SpeechToTextModule.prototype.encode), -// decode: stateWrapper(SpeechToTextModule.prototype.decode), -// transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), -// stream, -// streamStop: wrapper(SpeechToTextModule.prototype.streamStop), -// streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), -// }; -// }; - import { useEffect, useCallback, useState } from 'react'; import { SpeechToTextModule, diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index 5d5fd3248..1ad2f6c97 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -4,7 +4,6 @@ import { ResourceFetcher } from '../../utils/ResourceFetcher'; import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; -// 1. 
Define the Word interface matching your C++ JSI object structure export interface Word { word: string; start: number; @@ -142,10 +141,8 @@ export class SpeechToTextModule { committed: string | Word[]; nonCommitted: string | Word[]; }> { - console.log('[4] Module: Entered stream method'); this.validateOptions(options); - // Ensure we strictly default to false const enableTimestamps = options.enableTimestamps === true; const queue: { From 4e1ae51f2d638daf7a05454f002ad6bd3c419a96 Mon Sep 17 00:00:00 2001 From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> Date: Wed, 21 Jan 2026 16:36:46 +0100 Subject: [PATCH 06/14] Apply suggestions from code review --- apps/speech/screens/SpeechToTextScreen.tsx | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index f844241f3..ab3cdefb0 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -88,12 +88,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: true - }); - setTranscription(result); - } else { - const result = await model.transcribe(audioBuffer, { - enableTimestamps: false + enableTimestamps: enableTimestamps }); setTranscription(result); } @@ -164,18 +159,17 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { Status: {getModelStatus()} - {/* CHANGE 7: Add UI for the Toggle */} Enable Timestamps { setEnableTimestamps(val); - setTranscription(val ? [] : ''); // Reset transcription on toggle + setTranscription(val ? [] : ''); }} trackColor={{ false: '#767577', true: '#0f186e' }} thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'} - disabled={model.isGenerating} // Disable changing mode while running + disabled={model.isGenerating} /> From b6bfdb72385724e9cff91da6d55a613d1d20c124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 16:46:49 +0100 Subject: [PATCH 07/14] Apply further clearing --- apps/speech/screens/SpeechToTextScreen.tsx | 4 --- .../host_objects/JsiConversions.h | 1 - .../host_objects/ModelHostObject.h | 3 --- .../models/speech_to_text/SpeechToText.h | 1 - .../stream/OnlineASRProcessor.cpp | 25 ++++++------------- .../stream/OnlineASRProcessor.h | 3 --- .../speech_to_text/types/ProcessResult.h | 5 ---- .../useSpeechToText.ts | 14 ----------- 8 files changed, 7 insertions(+), 49 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index ab3cdefb0..4be72abf4 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -84,8 +84,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); - // CHANGE 4: Pass the toggle flag to transcribe - // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { enableTimestamps: enableTimestamps @@ -240,7 +238,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { }; const styles = StyleSheet.create({ - // ... existing styles ... 
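// A minimal sketch of how the transcribe() overloads added in this series are expected to
// resolve, given the Word shape exported above; importing from the package root is an
// assumption (inside the repo the module lives under src/modules/natural_language_processing),
// and the sketch presumes stt.load(...) has already completed.
import { SpeechToTextModule, Word } from 'react-native-executorch';

async function transcribeBothWays(stt: SpeechToTextModule, waveform: Float32Array) {
  // With enableTimestamps: true the word-level overload applies; start/end are the offsets
  // the example screen formats as seconds.
  const words: Word[] = await stt.transcribe(waveform, { enableTimestamps: true });
  // With enableTimestamps: false the original plain-string behaviour is kept
  // (native bytes decoded via TextDecoder).
  const text: string = await stt.transcribe(waveform, { enableTimestamps: false });
  return { words, text };
}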
container: { flex: 1, alignItems: 'center', @@ -272,7 +269,6 @@ const styles = StyleSheet.create({ marginTop: 12, alignItems: 'center', }, - // New style for the toggle toggleContainer: { flexDirection: 'row', alignItems: 'center', diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 95da364ff..e95e930e7 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -320,7 +320,6 @@ inline jsi::Value getJsiValue(const Word &word, jsi::Runtime &runtime) { inline jsi::Value getJsiValue(const std::vector &vec, jsi::Runtime &runtime) { jsi::Array array(runtime, vec.size()); for (size_t i = 0; i < vec.size(); ++i) { - // Convert each Word using the helper above and place in array array.setValueAtIndex(runtime, i, getJsiValue(vec[i], runtime)); } return {runtime, array}; diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index f0ec05b64..1843e8672 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -20,14 +20,11 @@ #include #include #include -#include #include #include #include #include -using rnexecutorch::models::speech_to_text::types::Word; - namespace rnexecutorch { template class ModelHostObject : public JsiHostObject { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h index 8f6799c4e..883436f4a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h @@ -23,7 +23,6 @@ class SpeechToText { [[nodiscard( "Registered non-void function")]] std::shared_ptr decode(std::span tokens, std::span encoderOutput) const; - // [[nodiscard("Registered non-void function")]] std::vector [[nodiscard("Registered non-void function")]] std::vector transcribe(std::span waveform, std::string languageOption) const; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp index f62986b72..3137d274b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp @@ -34,12 +34,14 @@ ProcessResult OnlineASRProcessor::processIter(const DecodingOptions &options) { chunkCompletedSegment(res); } + auto move_to_vector = [](auto& container) { + return std::vector(std::make_move_iterator(container.begin()), + std::make_move_iterator(container.end())); + }; + std::deque nonCommittedWords = this->hypothesisBuffer.complete(); - // return {this->toFlush(flushed), this->toFlush(nonCommittedWords)}; - return {std::vector(std::make_move_iterator(flushed.begin()), - std::make_move_iterator(flushed.end())), - std::vector(std::make_move_iterator(nonCommittedWords.begin()), - 
std::make_move_iterator(nonCommittedWords.end()))}; + + return { move_to_vector(flushed), move_to_vector(nonCommittedWords) }; } void OnlineASRProcessor::chunkCompletedSegment(std::span res) { @@ -86,22 +88,9 @@ std::vector OnlineASRProcessor::finish() { std::vector buffer(std::make_move_iterator(bufferDeq.begin()), std::make_move_iterator(bufferDeq.end())); - // std::string committedText = this->toFlush(buffer); this->bufferTimeOffset += static_cast(audioBuffer.size()) / OnlineASRProcessor::kSamplingRate; return buffer; } -// std::string OnlineASRProcessor::toFlush(const std::deque &words) const -// { -// std::string text; -// text.reserve(std::accumulate( -// words.cbegin(), words.cend(), 0, -// [](size_t sum, const Word &w) { return sum + w.content.size(); })); -// for (const auto &word : words) { -// text.append(word.content); -// } -// return text; -// } - } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h index 720e6bf76..3abaad3b6 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h @@ -12,7 +12,6 @@ class OnlineASRProcessor { void insertAudioChunk(std::span audio); types::ProcessResult processIter(const types::DecodingOptions &options); - // std::string finish(); std::vector finish(); std::vector audioBuffer; @@ -27,8 +26,6 @@ class OnlineASRProcessor { void chunkCompletedSegment(std::span res); void chunkAt(float time); - - // std::string toFlush(const std::deque &words) const; }; } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h index 685ba2b76..681495e2a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h @@ -4,11 +4,6 @@ namespace rnexecutorch::models::speech_to_text::types { -// struct ProcessResult { -// std::string committed; -// std::string nonCommitted; -// }; - struct ProcessResult { std::vector committed; std::vector nonCommitted; diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 40801c7b3..2ff785490 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -21,7 +21,6 @@ export const useSpeechToText = ({ const [modelInstance] = useState(() => new SpeechToTextModule()); - // FIX 1: Allow state to be either string or Word[] const [committedTranscription, setCommittedTranscription] = useState< string | Word[] >(''); @@ -77,12 +76,6 @@ export const useSpeechToText = ({ const stream = useCallback( async (options?: DecodingOptions & { enableTimestamps?: boolean }) => { - console.log( - '[2] Hook: Stream called. 
Ready:', - isReady, - 'Generating:', - isGenerating - ); if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, @@ -96,7 +89,6 @@ export const useSpeechToText = ({ setIsGenerating(true); - // FIX 2: Reset based on the mode requested const enableTimestamps = options?.enableTimestamps ?? false; setCommittedTranscription(enableTimestamps ? [] : ''); setNonCommittedTranscription(enableTimestamps ? [] : ''); @@ -104,18 +96,13 @@ export const useSpeechToText = ({ let fullResult: string | Word[] = enableTimestamps ? [] : ''; try { - console.log('[3] Hook: Calling modelInstance.stream()'); - // @ts-ignore - Typescript struggles with the dual generator return type, but logic is safe for await (const { committed, nonCommitted } of modelInstance.stream( options )) { console.log(committed, nonCommitted); - // FIX 3: Dynamic Merging Logic if (typeof committed === 'string') { - // --- STRING MODE --- if (committed.length > 0) { setCommittedTranscription((prev) => { - // Safety check: if prev was somehow an array, reset it or cast to string const prevStr = typeof prev === 'string' ? prev : ''; return prevStr + committed; }); @@ -123,7 +110,6 @@ export const useSpeechToText = ({ } setNonCommittedTranscription(nonCommitted as string); } else { - // --- WORD[] MODE --- const committedWords = committed as Word[]; const nonCommittedWords = nonCommitted as Word[]; From dfea40e05da8c313d2e6dec379897e29dbf12bac Mon Sep 17 00:00:00 2001 From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> Date: Wed, 21 Jan 2026 17:15:36 +0100 Subject: [PATCH 08/14] Apply suggestion from @msluszniak --- .../hooks/natural_language_processing/useSpeechToText.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 2ff785490..2afca892c 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -49,7 +49,14 @@ export const useSpeechToText = ({ setError(parseUnknownError(err)); } })(); - }, [modelInstance, model, preventLoad]); + }, [ + modelInstance, + model.isMultilingual, + model.encoderSource, + model.decoderSource, + model.tokenizerSource, + preventLoad, + ]); const stateWrapper = useCallback( Promise>(fn: T) => From 41839d0c634ebee4a5affcdeaad59f9dfc1681b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 17:31:07 +0100 Subject: [PATCH 09/14] Apply autofix lint changes --- apps/speech/screens/SpeechToTextScreen.tsx | 2 +- .../modules/natural_language_processing/SpeechToTextModule.ts | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 4be72abf4..87ad8bd50 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -86,7 +86,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: enableTimestamps + enableTimestamps: enableTimestamps, }); setTranscription(result); } diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts 
b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index 1ad2f6c97..61162078c 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -135,9 +135,7 @@ export class SpeechToTextModule { options?: DecodingOptions & { enableTimestamps?: false | undefined } ): AsyncGenerator<{ committed: string; nonCommitted: string }>; - public async *stream( - options: DecodingOptions = {} - ): AsyncGenerator<{ + public async *stream(options: DecodingOptions = {}): AsyncGenerator<{ committed: string | Word[]; nonCommitted: string | Word[]; }> { From 2f916f8cbca0ae41e8778ebdc600efa9feb4da10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 17:57:16 +0100 Subject: [PATCH 10/14] Fix linter issues --- apps/llm/app/voice_chat/index.tsx | 14 ++++++++++-- .../useSpeechToText.ts | 22 +++++++++++++------ .../SpeechToTextModule.ts | 6 ++++- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx index 79a713c93..0bf4c9b30 100644 --- a/apps/llm/app/voice_chat/index.tsx +++ b/apps/llm/app/voice_chat/index.tsx @@ -76,7 +76,11 @@ function VoiceChatScreen() { }); recorder.start(); const transcription = await speechToText.stream(); - await llm.sendMessage(transcription); + await llm.sendMessage( + typeof transcription === 'string' + ? transcription + : transcription.map((w) => w.word).join(' ') + ); } }; @@ -105,7 +109,13 @@ function VoiceChatScreen() { ...llm.messageHistory, { role: 'user', - content: speechToText.committedTranscription, + content: + typeof speechToText.committedTranscription === + 'string' + ? speechToText.committedTranscription + : speechToText.committedTranscription + .map((w) => w.word) + .join(' '), }, ] : llm.messageHistory diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 2afca892c..da6549bec 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -103,11 +103,17 @@ export const useSpeechToText = ({ let fullResult: string | Word[] = enableTimestamps ? [] : ''; try { - for await (const { committed, nonCommitted } of modelInstance.stream( - options - )) { - console.log(committed, nonCommitted); + const streamGen = modelInstance.stream( + options as any + ) as AsyncGenerator<{ + committed: string | Word[]; + nonCommitted: string | Word[]; + }>; + + for await (const { committed, nonCommitted } of streamGen) { if (typeof committed === 'string') { + const nc = nonCommitted as unknown as string; + if (committed.length > 0) { setCommittedTranscription((prev) => { const prevStr = typeof prev === 'string' ? prev : ''; @@ -115,12 +121,12 @@ export const useSpeechToText = ({ }); (fullResult as string) += committed; } - setNonCommittedTranscription(nonCommitted as string); + setNonCommittedTranscription(nc); } else { const committedWords = committed as Word[]; const nonCommittedWords = nonCommitted as Word[]; - if (committedWords.length > 0) { + if (committedWords && committedWords.length > 0) { setCommittedTranscription((prev) => { const prevArr = Array.isArray(prev) ? 
prev : []; return [...prevArr, ...committedWords]; @@ -161,7 +167,9 @@ export const useSpeechToText = ({ nonCommittedTranscription, encode: stateWrapper(SpeechToTextModule.prototype.encode), decode: stateWrapper(SpeechToTextModule.prototype.decode), - transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), + transcribe: stateWrapper( + SpeechToTextModule.prototype.transcribe + ) as SpeechToTextModule['transcribe'], stream, streamStop: wrapper(SpeechToTextModule.prototype.streamStop), streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index 61162078c..98520a2e7 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -95,11 +95,13 @@ export class SpeechToTextModule { options?: DecodingOptions & { enableTimestamps: true } ): Promise; + // eslint-disable-next-line no-dupe-class-members public async transcribe( waveform: Float32Array | number[], options?: DecodingOptions & { enableTimestamps?: false | undefined } ): Promise; + // eslint-disable-next-line no-dupe-class-members public async transcribe( waveform: Float32Array | number[], options: DecodingOptions = {} @@ -131,10 +133,12 @@ export class SpeechToTextModule { options: DecodingOptions & { enableTimestamps: true } ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }>; + // eslint-disable-next-line no-dupe-class-members public stream( options?: DecodingOptions & { enableTimestamps?: false | undefined } ): AsyncGenerator<{ committed: string; nonCommitted: string }>; + // eslint-disable-next-line no-dupe-class-members public async *stream(options: DecodingOptions = {}): AsyncGenerator<{ committed: string | Word[]; nonCommitted: string | Word[]; @@ -173,7 +177,7 @@ export class SpeechToTextModule { ), }); } catch (err) { - console.error('[Stream Decode Error]', err); + Logger.error('[Stream Decode Error]', err); } } else { queue.push({ committed, nonCommitted }); From 55f03be5ad90c4b867d6592ecd909a98865d7ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 18:40:42 +0100 Subject: [PATCH 11/14] Revert changing error messages --- .../hooks/natural_language_processing/useSpeechToText.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index da6549bec..c36e802db 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -64,12 +64,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded.' + 'The model is currently not loaded. Please load the model before calling this function.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating.' + 'The model is currently generating. Please wait until previous model run is complete.' 
); setIsGenerating(true); try { @@ -86,12 +86,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'Model not loaded' + 'The model is currently not loaded. Please load the model before calling this function.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'Model is generating' + 'The model is currently generating. Please wait until previous model run is complete.' ); setIsGenerating(true); From 2fafd87ab41691edf0e902a9d35ebb2ae85b0ef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 18:42:25 +0100 Subject: [PATCH 12/14] Revert one more message --- .../src/hooks/natural_language_processing/useSpeechToText.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index c36e802db..107053a47 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -150,7 +150,7 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'Model not loaded' + 'The model is currently not loaded. Please load the model before calling this function.' ); return fn.apply(modelInstance, args); }; From d394088b5b350c824d8e8fb33360740dd867ac87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 19:42:44 +0100 Subject: [PATCH 13/14] Update docs --- .../useSpeechToText.md | 105 +++++++++++++----- .../SpeechToTextModule.md | 56 +++++++--- 2 files changed, 120 insertions(+), 41 deletions(-) diff --git a/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md index d94c96a66..b3171c77f 100644 --- a/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md +++ b/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md @@ -75,25 +75,31 @@ For more information on loading resources, take a look at [loading models](../.. ### Returns -| Field | Type | Description | -| --------------------------- | ---------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. | -| `stream` | `(options?: DecodingOptions \| undefined) => Promise` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. 
Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | -| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | -| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | -| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | -| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. | -| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | -| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | -| `error` | `string \| null` | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Tracks the progress of the model download process. | +| Field | Type | Description | +| --------------------------- | ---------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `transcribe` | `(waveform: Float32Array | number[], options?: DecodingOptions & { enableTimestamps?: boolean }) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. For multilingual models, specify the language in `options`, e.g. `{ language: 'es' }` for multilingual models. If `enableTimestamps` is true, returns transcription with timestamps (`Word[]>`). If `enableTimestamps` is false (default), returns transcription as a string. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. | +| `stream` | `(options?: DecodingOptions & { enableTimestamps?: boolean }) => Promise` | Starts a streaming transcription process. 
Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. As in `transcribe`, you can decide either you want transcription with timestamps or not by setting `enableTimestamps`. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | +| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | +| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | +| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | +| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. | +| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | +| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | +| `error` | `string \| null` | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Tracks the progress of the model download process. |
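For streaming, a minimal sketch of how these pieces fit together is shown below. It is not taken from the library's examples: the `recorder` object is a hypothetical audio source that delivers 16 kHz `Float32Array` chunks to a callback (any source with that shape would do), while the `useSpeechToText` calls themselves (`stream`, `streamInsert`, `streamStop`, `committedTranscription`) are the ones documented in the table above.

```tsx
import React from 'react';
import { Button, Text, View } from 'react-native';
import { useSpeechToText, WHISPER_TINY_EN, Word } from 'react-native-executorch';

// Hypothetical audio source: anything that hands you 16 kHz Float32Array chunks.
declare const recorder: {
  onAudioReady: (cb: (chunk: Float32Array) => void) => void;
  start: () => void;
  stop: () => void;
};

function LiveTranscription() {
  const model = useSpeechToText({ model: WHISPER_TINY_EN });

  const startStreaming = async () => {
    // Feed chunks into the ongoing stream as they arrive.
    recorder.onAudioReady((chunk) => model.streamInsert(chunk));
    recorder.start();
    // Resolves with the full transcription after streamStop() is called;
    // with enableTimestamps it is a Word[] instead of a string.
    const words = await model.stream({ enableTimestamps: true });
    console.log(words);
  };

  const stopStreaming = () => {
    recorder.stop();
    model.streamStop();
  };

  // While streaming with timestamps, committedTranscription holds Word[];
  // fall back to the raw string when timestamps are disabled.
  const committedText = Array.isArray(model.committedTranscription)
    ? model.committedTranscription.map((w: Word) => w.word).join(' ')
    : model.committedTranscription;

  return (
    <View>
      <Button title="Start" onPress={startStreaming} disabled={!model.isReady} />
      <Button title="Stop" onPress={stopStreaming} disabled={!model.isGenerating} />
      <Text>{committedText}</Text>
    </View>
  );
}
```

The same call order works without timestamps: omit `enableTimestamps` and the transcription fields remain plain strings.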
Type definitions ```typescript +interface Word { + word: string; + start: number; + end: number; +} + // Languages supported by whisper (Multilingual) type SpeechToTextLanguage = | 'af' @@ -174,6 +180,7 @@ type SpeechToTextLanguage = interface DecodingOptions { language?: SpeechToTextLanguage; + enableTimestamps?: boolean; } interface SpeechToTextModelConfig { @@ -204,12 +211,25 @@ const model = useSpeechToText({ const transcription = await model.transcribe(spanishAudio, { language: 'es' }); ``` +### Timestamps + +You can obtain word-level timestamps by setting `enableTimestamps: true` in the options. This changes the return type from a string to an array of `Word` objects. + +```typescript +const words = await model.transcribe(audioBuffer, { enableTimestamps: true }); +// words: [{ word: "Hello", start: 0.0, end: 0.4 }, ...] +``` + ## Example ```tsx import React, { useState } from 'react'; -import { Button, Text } from 'react-native'; -import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +import { Button, Text, View } from 'react-native'; +import { + useSpeechToText, + WHISPER_TINY_EN, + Word, +} from 'react-native-executorch'; import { AudioContext } from 'react-native-audio-api'; import * as FileSystem from 'expo-file-system'; @@ -218,7 +238,7 @@ function App() { model: WHISPER_TINY_EN, }); - const [transcription, setTranscription] = useState(''); + const [transcription, setTranscription] = useState(''); const loadAudio = async () => { const { uri } = await FileSystem.downloadAsync( @@ -235,14 +255,38 @@ function App() { const handleTranscribe = async () => { const audio = await loadAudio(); - await model.transcribe(audio); + // Default text transcription + const result = await model.transcribe(audio); + setTranscription(result); + }; + + const handleTranscribeWithTimestamps = async () => { + const audio = await loadAudio(); + // Transcription with timestamps + const result = await model.transcribe(audio, { enableTimestamps: true }); + setTranscription(result); + }; + + const renderContent = () => { + if (typeof transcription === 'string') { + return {transcription}; + } + return transcription.map((w, i) => ( + + {w.word} ({w.start.toFixed(2)}s) + + )); }; return ( - <> - {transcription} -