From 0ed673071ceeb51ee8994c78b049e43c2aecbe96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Tue, 20 Jan 2026 14:16:54 +0100 Subject: [PATCH 01/14] Draft of changes introducing timestamping --- .../host_objects/JsiConversions.h | 20 ++ .../models/speech_to_text/SpeechToText.cpp | 127 ++++++++-- .../models/speech_to_text/SpeechToText.h | 3 +- .../stream/OnlineASRProcessor.cpp | 32 +-- .../stream/OnlineASRProcessor.h | 5 +- .../useSpeechToText.ts | 177 +++++++++++++- .../SpeechToTextModule.ts | 228 ++++++++++++++++-- 7 files changed, 535 insertions(+), 57 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 97e8d91fb..ec6332209 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -62,6 +62,20 @@ getValue>(const jsi::Value &val, val.asObject(runtime).asFunction(runtime)); } +template <> +inline getValue(const jsi::Value &val, jsi::Runtime &runtime) { + jsi::Array jsiArr(rt, words.size()); + for (size_t i = 0; i < words.size(); ++i) { + jsi::Object obj(rt); + obj.setProperty(rt, "word", + jsi::String::createFromUtf8(rt, words[i].content)); + obj.setProperty(rt, "start", static_cast(words[i].start)); + obj.setProperty(rt, "end", static_cast(words[i].end)); + jsiArr.setValueAtIndex(rt, i, obj); + } + return jsiArr; +}; + template <> inline JSTensorViewIn getValue(const jsi::Value &val, jsi::Runtime &runtime) { @@ -218,6 +232,12 @@ getValue>(const jsi::Value &val, jsi::Runtime &runtime) { return getArrayAsVector(val, runtime); } +template <> +inline std::vector getValue>(const jsi::Value &val, + jsi::Runtime &runtime) { + return getArrayAsVector(val, runtime); +} + // Template specializations for std::span types template <> inline std::span getValue>(const jsi::Value &val, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index 3c81eb8e9..7f8ef81cf 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -66,11 +66,92 @@ std::vector SpeechToText::transcribe(std::span waveform, return {transcription.begin(), transcription.end()}; } +std::vector SpeechToText::transcribe(std::span waveform, + std::string languageOption) const { + std::vector segments = + this->asr->transcribe(waveform, DecodingOptions(languageOption)); + std::vector transcription; + + size_t transcriptionLength = 0; + for (auto &segment : segments) { + transcriptionLength += segment.words.size(); + } + + transcription.reserve(segments.size()); + + for (auto &segment : segments) { + for (auto &word : segment.words) { + transcription.push_back(word); + } + } + + auto wordsToJsi = [](jsi::Runtime &rt, + const std::vector &words) -> jsi::Value { + jsi::Array jsiArr(rt, words.size()); + for (size_t i = 0; i < words.size(); ++i) { + jsi::Object obj(rt); + obj.setProperty(rt, "word", + jsi::String::createFromUtf8(rt, words[i].content)); + obj.setProperty(rt, "start", static_cast(words[i].start)); + obj.setProperty(rt, "end", static_cast(words[i].end)); + jsiArr.setValueAtIndex(rt, i, obj); + } + return jsiArr; + }; + + return transcription; +} 
+ size_t SpeechToText::getMemoryLowerBound() const noexcept { return this->encoder->getMemoryLowerBound() + this->decoder->getMemoryLowerBound(); } +// void SpeechToText::stream(std::shared_ptr callback, +// std::string languageOption) { +// if (this->isStreaming) { +// throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress, +// "Streaming is already in progress!"); +// } + +// auto nativeCallback = +// [this, callback](const std::vector &committedVec, +// const std::vector &nonCommittedVec, bool isDone) +// { +// this->callInvoker->invokeAsync([callback, committedVec, +// nonCommittedVec, +// isDone](jsi::Runtime &rt) { +// callback->call( +// rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, +// rt), rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, +// rt), jsi::Value(isDone)); +// }); +// }; + +// this->isStreaming = true; +// while (this->isStreaming) { +// if (!this->readyToProcess || +// this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) +// { +// std::this_thread::sleep_for(std::chrono::milliseconds(100)); +// continue; +// } +// ProcessResult res = +// this->processor->processIter(DecodingOptions(languageOption)); + +// nativeCallback({res.committed.begin(), res.committed.end()}, +// {res.nonCommitted.begin(), res.nonCommitted.end()}, +// false); +// this->readyToProcess = false; +// } + +// std::string committed = this->processor->finish(); + +// nativeCallback({committed.begin(), committed.end()}, {}, true); + +// this->resetStreamState(); +// } + void SpeechToText::stream(std::shared_ptr callback, std::string languageOption) { if (this->isStreaming) { @@ -78,17 +159,33 @@ void SpeechToText::stream(std::shared_ptr callback, "Streaming is already in progress!"); } - auto nativeCallback = - [this, callback](const std::vector &committedVec, - const std::vector &nonCommittedVec, bool isDone) { - this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec, - isDone](jsi::Runtime &rt) { - callback->call( - rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt), - rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt), - jsi::Value(isDone)); - }); - }; + auto wordsToJsi = [](jsi::Runtime &rt, + const std::vector &words) -> jsi::Value { + jsi::Array jsiArr(rt, words.size()); + for (size_t i = 0; i < words.size(); ++i) { + jsi::Object obj(rt); + obj.setProperty(rt, "word", + jsi::String::createFromUtf8(rt, words[i].content)); + obj.setProperty(rt, "start", static_cast(words[i].start)); + obj.setProperty(rt, "end", static_cast(words[i].end)); + jsiArr.setValueAtIndex(rt, i, obj); + } + return jsiArr; + }; + + auto nativeCallback = [this, callback, + wordsToJsi](const std::vector &committedVec, + const std::vector &nonCommittedVec, + bool isDone) { + this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec, + isDone, wordsToJsi](jsi::Runtime &rt) { + jsi::Value committedJsi = wordsToJsi(rt, committedVec); + jsi::Value nonCommittedJsi = wordsToJsi(rt, nonCommittedVec); + + callback->call(rt, std::move(committedJsi), std::move(nonCommittedJsi), + jsi::Value(isDone)); + }); + }; this->isStreaming = true; while (this->isStreaming) { @@ -100,14 +197,14 @@ void SpeechToText::stream(std::shared_ptr callback, ProcessResult res = this->processor->processIter(DecodingOptions(languageOption)); - nativeCallback({res.committed.begin(), res.committed.end()}, - {res.nonCommitted.begin(), res.nonCommitted.end()}, false); + nativeCallback(res.committed, res.nonCommitted, false); this->readyToProcess = false; } - 
std::string committed = this->processor->finish(); + // finish() now returns std::vector + std::vector committed = this->processor->finish(); - nativeCallback({committed.begin(), committed.end()}, {}, true); + nativeCallback(committed, {}, true); this->resetStreamState(); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h index d2111d378..af02a5357 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h @@ -23,7 +23,8 @@ class SpeechToText { [[nodiscard( "Registered non-void function")]] std::shared_ptr decode(std::span tokens, std::span encoderOutput) const; - [[nodiscard("Registered non-void function")]] std::vector + // [[nodiscard("Registered non-void function")]] std::vector + [[nodiscard("Registered non-void function")]] std::vector transcribe(std::span waveform, std::string languageOption) const; size_t getMemoryLowerBound() const noexcept; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp index c6a99e9a2..b8a7aced4 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp @@ -77,23 +77,27 @@ void OnlineASRProcessor::chunkAt(float time) { this->bufferTimeOffset = time; } -std::string OnlineASRProcessor::finish() { - const std::deque buffer = this->hypothesisBuffer.complete(); - std::string committedText = this->toFlush(buffer); +std::vector OnlineASRProcessor::finish() { + std::deque bufferDeq = this->hypothesisBuffer.complete(); + std::vector buffer(std::make_move_iterator(bufferDeq.begin()), + std::make_move_iterator(bufferDeq.end())); + + // std::string committedText = this->toFlush(buffer); this->bufferTimeOffset += static_cast(audioBuffer.size()) / OnlineASRProcessor::kSamplingRate; - return committedText; + return buffer; } -std::string OnlineASRProcessor::toFlush(const std::deque &words) const { - std::string text; - text.reserve(std::accumulate( - words.cbegin(), words.cend(), 0, - [](size_t sum, const Word &w) { return sum + w.content.size(); })); - for (const auto &word : words) { - text.append(word.content); - } - return text; -} +// std::string OnlineASRProcessor::toFlush(const std::deque &words) const +// { +// std::string text; +// text.reserve(std::accumulate( +// words.cbegin(), words.cend(), 0, +// [](size_t sum, const Word &w) { return sum + w.content.size(); })); +// for (const auto &word : words) { +// text.append(word.content); +// } +// return text; +// } } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h index c50b56271..720e6bf76 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h @@ -12,7 +12,8 @@ class OnlineASRProcessor { void 
insertAudioChunk(std::span audio); types::ProcessResult processIter(const types::DecodingOptions &options); - std::string finish(); + // std::string finish(); + std::vector finish(); std::vector audioBuffer; @@ -27,7 +28,7 @@ class OnlineASRProcessor { void chunkCompletedSegment(std::span res); void chunkAt(float time); - std::string toFlush(const std::deque &words) const; + // std::string toFlush(const std::deque &words) const; }; } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 3e1324f54..6f22bf7b5 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -1,5 +1,147 @@ +// import { useEffect, useCallback, useState } from 'react'; +// import { SpeechToTextModule, Word } from '../../modules/natural_language_processing/SpeechToTextModule'; +// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; +// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; +// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; + +// export const useSpeechToText = ({ +// model, +// preventLoad = false, +// }: { +// model: SpeechToTextModelConfig; +// preventLoad?: boolean; +// }) => { +// const [error, setError] = useState(null); +// const [isReady, setIsReady] = useState(false); +// const [isGenerating, setIsGenerating] = useState(false); +// const [downloadProgress, setDownloadProgress] = useState(0); + +// const [modelInstance] = useState(() => new SpeechToTextModule()); +// const [committedTranscription, setCommittedTranscription] = useState(Word); +// const [nonCommittedTranscription, setNonCommittedTranscription] = +// useState(Word); + +// useEffect(() => { +// if (preventLoad) return; +// (async () => { +// setDownloadProgress(0); +// setError(null); +// try { +// setIsReady(false); +// await modelInstance.load( +// { +// isMultilingual: model.isMultilingual, +// encoderSource: model.encoderSource, +// decoderSource: model.decoderSource, +// tokenizerSource: model.tokenizerSource, +// }, +// setDownloadProgress +// ); +// setIsReady(true); +// } catch (err) { +// setError(parseUnknownError(err)); +// } +// })(); +// }, [ +// modelInstance, +// model.isMultilingual, +// model.encoderSource, +// model.decoderSource, +// model.tokenizerSource, +// preventLoad, +// ]); + +// const stateWrapper = useCallback( +// Promise>(fn: T) => +// async (...args: Parameters): Promise>> => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. Please load the model before calling this function.' +// ); +// if (isGenerating) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModelGenerating, +// 'The model is currently generating. Please wait until previous model run is complete.' +// ); +// setIsGenerating(true); +// try { +// return await fn.apply(modelInstance, args); +// } finally { +// setIsGenerating(false); +// } +// }, +// [isReady, isGenerating, modelInstance] +// ); + +// const stream = useCallback( +// async (options?: DecodingOptions) => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. 
Please load the model before calling this function.' +// ); +// if (isGenerating) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModelGenerating, +// 'The model is currently generating. Please wait until previous model run is complete.' +// ); +// setIsGenerating(true); +// setCommittedTranscription(''); +// setNonCommittedTranscription(''); +// let transcription = ''; +// try { +// for await (const { committed, nonCommitted } of modelInstance.stream( +// options +// )) { +// setCommittedTranscription((prev) => prev + committed); +// setNonCommittedTranscription(nonCommitted); +// transcription += committed; +// } +// } finally { +// setIsGenerating(false); +// } +// return transcription; +// }, +// [isReady, isGenerating, modelInstance] +// ); + +// const wrapper = useCallback( +// any>(fn: T) => { +// return (...args: Parameters): ReturnType => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. Please load the model before calling this function.' +// ); +// return fn.apply(modelInstance, args); +// }; +// }, +// [isReady, modelInstance] +// ); + +// return { +// error, +// isReady, +// isGenerating, +// downloadProgress, +// committedTranscription, +// nonCommittedTranscription, +// encode: stateWrapper(SpeechToTextModule.prototype.encode), +// decode: stateWrapper(SpeechToTextModule.prototype.decode), +// transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), +// stream, +// streamStop: wrapper(SpeechToTextModule.prototype.streamStop), +// streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), +// }; +// }; + import { useEffect, useCallback, useState } from 'react'; -import { SpeechToTextModule } from '../../modules/natural_language_processing/SpeechToTextModule'; +// Make sure Word is exported from your module file +import { + SpeechToTextModule, + Word, +} from '../../modules/natural_language_processing/SpeechToTextModule'; import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; @@ -17,9 +159,14 @@ export const useSpeechToText = ({ const [downloadProgress, setDownloadProgress] = useState(0); const [modelInstance] = useState(() => new SpeechToTextModule()); - const [committedTranscription, setCommittedTranscription] = useState(''); - const [nonCommittedTranscription, setNonCommittedTranscription] = - useState(''); + + // FIX 1: Initialize with empty array [], generic type Word[] + const [committedTranscription, setCommittedTranscription] = useState( + [] + ); + const [nonCommittedTranscription, setNonCommittedTranscription] = useState< + Word[] + >([]); useEffect(() => { if (preventLoad) return; @@ -87,21 +234,31 @@ export const useSpeechToText = ({ 'The model is currently generating. Please wait until previous model run is complete.' 
); setIsGenerating(true); - setCommittedTranscription(''); - setNonCommittedTranscription(''); - let transcription = ''; + + // FIX 2: Reset to empty arrays + setCommittedTranscription([]); + setNonCommittedTranscription([]); + + // Accumulator is now an array of Words, not a string + const fullResult: Word[] = []; + try { for await (const { committed, nonCommitted } of modelInstance.stream( options )) { - setCommittedTranscription((prev) => prev + committed); + // FIX 3: Update state by appending arrays + if (committed.length > 0) { + setCommittedTranscription((prev) => [...prev, ...committed]); + fullResult.push(...committed); + } + + // nonCommitted is always a fresh partial chunk setNonCommittedTranscription(nonCommitted); - transcription += committed; } } finally { setIsGenerating(false); } - return transcription; + return fullResult; }, [isReady, isGenerating, modelInstance] ); diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index 9619547c8..e0ca88251 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -1,18 +1,212 @@ +// import { Logger } from '../../common/Logger'; +// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; +// import { ResourceFetcher } from '../../utils/ResourceFetcher'; +// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; +// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; + +// export class SpeechToTextModule { +// private nativeModule: any; + +// private modelConfig!: SpeechToTextModelConfig; + +// private textDecoder = new TextDecoder('utf-8', { +// fatal: false, +// ignoreBOM: true, +// }); + +// public async load( +// model: SpeechToTextModelConfig, +// onDownloadProgressCallback: (progress: number) => void = () => {} +// ) { +// this.modelConfig = model; + +// const tokenizerLoadPromise = ResourceFetcher.fetch( +// undefined, +// model.tokenizerSource +// ); +// const encoderDecoderPromise = ResourceFetcher.fetch( +// onDownloadProgressCallback, +// model.encoderSource, +// model.decoderSource +// ); +// const [tokenizerSources, encoderDecoderResults] = await Promise.all([ +// tokenizerLoadPromise, +// encoderDecoderPromise, +// ]); +// const encoderSource = encoderDecoderResults?.[0]; +// const decoderSource = encoderDecoderResults?.[1]; +// if (!encoderSource || !decoderSource || !tokenizerSources) { +// throw new RnExecutorchError( +// RnExecutorchErrorCode.DownloadInterrupted, +// 'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.' +// ); +// } +// this.nativeModule = await global.loadSpeechToText( +// encoderSource, +// decoderSource, +// tokenizerSources[0]! 
+// ); +// } + +// public delete(): void { +// this.nativeModule.unload(); +// } + +// public async encode( +// waveform: Float32Array | number[] +// ): Promise { +// if (Array.isArray(waveform)) { +// Logger.info( +// 'Passing waveform as number[] is deprecated, use Float32Array instead' +// ); +// waveform = new Float32Array(waveform); +// } +// return new Float32Array(await this.nativeModule.encode(waveform)); +// } + +// public async decode( +// tokens: Int32Array | number[], +// encoderOutput: Float32Array | number[] +// ): Promise { +// if (Array.isArray(tokens)) { +// Logger.info( +// 'Passing tokens as number[] is deprecated, use Int32Array instead' +// ); +// tokens = new Int32Array(tokens); +// } +// if (Array.isArray(encoderOutput)) { +// Logger.info( +// 'Passing encoderOutput as number[] is deprecated, use Float32Array instead' +// ); +// encoderOutput = new Float32Array(encoderOutput); +// } +// return new Float32Array( +// await this.nativeModule.decode(tokens, encoderOutput) +// ); +// } + +// public async transcribe( +// waveform: Float32Array | number[], +// options: DecodingOptions = {} +// ): Promise { +// this.validateOptions(options); + +// if (Array.isArray(waveform)) { +// Logger.info( +// 'Passing waveform as number[] is deprecated, use Float32Array instead' +// ); +// waveform = new Float32Array(waveform); +// } +// const transcriptionBytes = await this.nativeModule.transcribe( +// waveform, +// options.language || '' +// ); +// return this.textDecoder.decode(new Uint8Array(transcriptionBytes)); +// } + +// public async *stream( +// options: DecodingOptions = {} +// ): AsyncGenerator<{ committed: string; nonCommitted: string }> { +// this.validateOptions(options); + +// const queue: { committed: string; nonCommitted: string }[] = []; +// let waiter: (() => void) | null = null; +// let finished = false; +// let error: unknown; + +// const wake = () => { +// waiter?.(); +// waiter = null; +// }; + +// (async () => { +// try { +// await this.nativeModule.stream( +// (committed: number[], nonCommitted: number[], isDone: boolean) => { +// queue.push({ +// committed: this.textDecoder.decode(new Uint8Array(committed)), +// nonCommitted: this.textDecoder.decode( +// new Uint8Array(nonCommitted) +// ), +// }); +// if (isDone) { +// finished = true; +// } +// wake(); +// }, +// options.language || '' +// ); +// finished = true; +// wake(); +// } catch (e) { +// error = e; +// finished = true; +// wake(); +// } +// })(); + +// while (true) { +// if (queue.length > 0) { +// yield queue.shift()!; +// if (finished && queue.length === 0) { +// return; +// } +// continue; +// } +// if (error) throw parseUnknownError(error); +// if (finished) return; +// await new Promise((r) => (waiter = r)); +// } +// } + +// public streamInsert(waveform: Float32Array | number[]): void { +// if (Array.isArray(waveform)) { +// Logger.info( +// 'Passing waveform as number[] is deprecated, use Float32Array instead' +// ); +// waveform = new Float32Array(waveform); +// } +// this.nativeModule.streamInsert(waveform); +// } + +// public streamStop(): void { +// this.nativeModule.streamStop(); +// } + +// private validateOptions(options: DecodingOptions) { +// if (!this.modelConfig.isMultilingual && options.language) { +// throw new RnExecutorchError( +// RnExecutorchErrorCode.InvalidConfig, +// 'Model is not multilingual, cannot set language' +// ); +// } +// if (this.modelConfig.isMultilingual && !options.language) { +// throw new RnExecutorchError( +// RnExecutorchErrorCode.InvalidConfig, +// 'Model 
is multilingual, provide a language' +// ); +// } +// } +// } + import { Logger } from '../../common/Logger'; import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; import { ResourceFetcher } from '../../utils/ResourceFetcher'; import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; +// 1. Define the Word interface matching your C++ JSI object structure +export interface Word { + word: string; + start: number; + end: number; +} + export class SpeechToTextModule { private nativeModule: any; - private modelConfig!: SpeechToTextModelConfig; - private textDecoder = new TextDecoder('utf-8', { - fatal: false, - ignoreBOM: true, - }); + // 2. TextDecoder is removed as C++ now returns JS objects directly public async load( model: SpeechToTextModelConfig, @@ -85,10 +279,11 @@ export class SpeechToTextModule { ); } + // 3. Update transcribe to return Word[] instead of string public async transcribe( waveform: Float32Array | number[], options: DecodingOptions = {} - ): Promise { + ): Promise { this.validateOptions(options); if (Array.isArray(waveform)) { @@ -97,19 +292,23 @@ export class SpeechToTextModule { ); waveform = new Float32Array(waveform); } - const transcriptionBytes = await this.nativeModule.transcribe( + + // The native module now returns an Array of Objects, not bytes + const transcription: Word[] = await this.nativeModule.transcribe( waveform, options.language || '' ); - return this.textDecoder.decode(new Uint8Array(transcriptionBytes)); + + return transcription; } + // 4. Update stream to yield Word[] structure public async *stream( options: DecodingOptions = {} - ): AsyncGenerator<{ committed: string; nonCommitted: string }> { + ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }> { this.validateOptions(options); - const queue: { committed: string; nonCommitted: string }[] = []; + const queue: { committed: Word[]; nonCommitted: Word[] }[] = []; let waiter: (() => void) | null = null; let finished = false; let error: unknown; @@ -122,12 +321,11 @@ export class SpeechToTextModule { (async () => { try { await this.nativeModule.stream( - (committed: number[], nonCommitted: number[], isDone: boolean) => { + // Callback now receives arrays of objects directly + (committed: Word[], nonCommitted: Word[], isDone: boolean) => { queue.push({ - committed: this.textDecoder.decode(new Uint8Array(committed)), - nonCommitted: this.textDecoder.decode( - new Uint8Array(nonCommitted) - ), + committed, + nonCommitted, }); if (isDone) { finished = true; From b75204e25d6ba1864518e0d221ce99ddda86d6ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Tue, 20 Jan 2026 14:44:24 +0100 Subject: [PATCH 02/14] Add missing headers --- .../common/rnexecutorch/host_objects/JsiConversions.h | 1 + .../common/rnexecutorch/host_objects/ModelHostObject.h | 1 + 2 files changed, 2 insertions(+) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index ec6332209..f43d14180 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -18,6 +18,7 @@ #include #include #include +#include #include namespace rnexecutorch::jsi_conversion { diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h 
b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index c8232fe8c..fc96965ec 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include From 3c72c17341a6173142324bd2e19b7cd50e225c05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Tue, 20 Jan 2026 21:59:21 +0100 Subject: [PATCH 03/14] Add draft of working version for timestamps only --- apps/speech/screens/SpeechToTextScreen.tsx | 335 +++++++++++++++++- .../host_objects/JsiConversions.h | 49 ++- .../host_objects/ModelHostObject.h | 2 + .../models/speech_to_text/SpeechToText.cpp | 40 +-- .../stream/OnlineASRProcessor.cpp | 6 +- .../speech_to_text/types/ProcessResult.h | 9 +- 6 files changed, 397 insertions(+), 44 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 1e4525986..ad78dcb49 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -1,3 +1,300 @@ +// import React, { useEffect, useRef, useState } from 'react'; +// import { +// Text, +// View, +// StyleSheet, +// TouchableOpacity, +// ScrollView, +// TextInput, +// KeyboardAvoidingView, +// Platform, +// } from 'react-native'; +// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +// import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +// import FontAwesome from '@expo/vector-icons/FontAwesome'; +// import { +// AudioManager, +// AudioRecorder, +// AudioContext, +// } from 'react-native-audio-api'; +// import * as FileSystem from 'expo-file-system/legacy'; +// import SWMIcon from '../assets/swm_icon.svg'; +// import DeviceInfo from 'react-native-device-info'; + +// const isSimulator = DeviceInfo.isEmulatorSync(); + +// export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { +// const model = useSpeechToText({ +// model: WHISPER_TINY_EN, +// }); + +// const [transcription, setTranscription] = useState(''); +// const [audioURL, setAudioURL] = useState(''); +// const [liveTranscribing, setLiveTranscribing] = useState(false); +// const scrollViewRef = useRef(null); + +// const [recorder] = useState( +// () => +// new AudioRecorder({ +// sampleRate: 16000, +// bufferLengthInSamples: 1600, +// }) +// ); + +// useEffect(() => { +// AudioManager.setAudioSessionOptions({ +// iosCategory: 'playAndRecord', +// iosMode: 'spokenAudio', +// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], +// }); +// AudioManager.requestRecordingPermissions(); +// }, []); + +// const handleTranscribeFromURL = async () => { +// if (!audioURL.trim()) { +// console.warn('Please provide a valid audio file URL'); +// return; +// } + +// const { uri } = await FileSystem.downloadAsync( +// audioURL, +// FileSystem.cacheDirectory + 'audio_file' +// ); + +// const audioContext = new AudioContext({ sampleRate: 16000 }); + +// try { +// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); +// const audioBuffer = decodedAudioData.getChannelData(0); +// setTranscription(await model.transcribe(audioBuffer)); +// } catch (error) { +// console.error('Error decoding audio data', error); +// console.warn('Note: Supported file formats: mp3, wav, flac'); +// return; +// } +// }; + +// const handleStartTranscribeFromMicrophone = async () => { +// 
setLiveTranscribing(true); +// setTranscription(''); +// recorder.onAudioReady(({ buffer }) => { +// model.streamInsert(buffer.getChannelData(0)); +// }); +// recorder.start(); + +// try { +// await model.stream(); +// } catch (error) { +// console.error('Error during live transcription:', error); +// } +// }; + +// const handleStopTranscribeFromMicrophone = () => { +// recorder.stop(); +// model.streamStop(); +// console.log('Live transcription stopped'); +// setLiveTranscribing(false); +// }; + +// const getModelStatus = () => { +// if (model.error) return `${model.error}`; +// if (model.isGenerating) return 'Transcribing...'; +// if (model.isReady) return 'Ready to transcribe'; +// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; +// }; + +// const readyToTranscribe = !model.isGenerating && model.isReady; +// const recordingButtonDisabled = isSimulator || !readyToTranscribe; + +// return ( +// +// +// +// +// +// +// +// +// React Native ExecuTorch +// Speech to Text +// + +// +// Status: {getModelStatus()} +// + +// +// Transcription +// +// scrollViewRef.current?.scrollToEnd({ animated: true }) +// } +// > +// +// {transcription !== '' +// ? transcription +// : model.committedTranscription + +// model.nonCommittedTranscription} +// +// +// + +// +// +// +// +// Start +// +// + +// {liveTranscribing ? ( +// +// +// Stop Live Transcription +// +// ) : ( +// +// +// +// {isSimulator +// ? 'Recording is not available on Simulator' +// : 'Start Live Transcription'} +// +// +// )} +// +// +// +// +// ); +// }; + +// const styles = StyleSheet.create({ +// container: { +// flex: 1, +// alignItems: 'center', +// backgroundColor: 'white', +// paddingHorizontal: 16, +// }, +// keyboardAvoidingView: { +// flex: 1, +// width: '100%', +// }, +// header: { +// alignItems: 'center', +// position: 'relative', +// width: '100%', +// }, +// backButton: { +// position: 'absolute', +// left: 0, +// top: 10, +// padding: 10, +// zIndex: 1, +// }, +// headerText: { +// fontSize: 22, +// fontWeight: 'bold', +// color: '#0f186e', +// }, +// statusContainer: { +// marginTop: 12, +// alignItems: 'center', +// }, +// transcriptionContainer: { +// flex: 1, +// width: '100%', +// marginVertical: 12, +// }, +// transcriptionLabel: { +// marginLeft: 12, +// marginBottom: 4, +// color: '#0f186e', +// }, +// transcriptionScrollContainer: { +// borderRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// padding: 12, +// }, +// inputContainer: { +// marginBottom: 12, +// }, +// urlTranscriptionContainer: { +// width: '100%', +// flexDirection: 'row', +// }, +// urlTranscriptionInput: { +// flex: 1, +// padding: 12, +// borderTopLeftRadius: 12, +// borderBottomLeftRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// borderRightWidth: 0, +// }, +// urlTranscriptionButton: { +// backgroundColor: '#0f186e', +// justifyContent: 'center', +// alignItems: 'center', +// padding: 12, +// borderTopRightRadius: 12, +// borderBottomRightRadius: 12, +// }, +// buttonText: { +// color: 'white', +// fontWeight: '600', +// letterSpacing: -0.5, +// fontSize: 16, +// }, +// liveTranscriptionButton: { +// flexDirection: 'row', +// justifyContent: 'center', +// alignItems: 'center', +// padding: 12, +// borderRadius: 12, +// marginTop: 12, +// gap: 8, +// }, +// backgroundRed: { +// backgroundColor: 'red', +// }, +// backgroundBlue: { +// backgroundColor: '#0f186e', +// }, +// disabled: { +// opacity: 0.5, +// }, +// }); + import React, { useEffect, useRef, useState } from 'react'; import { Text, 
@@ -10,7 +307,12 @@ import { Platform, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; -import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +import { + useSpeechToText, + WHISPER_TINY_EN, + // Make sure Word is exported from your module + Word, +} from 'react-native-executorch'; import FontAwesome from '@expo/vector-icons/FontAwesome'; import { AudioManager, @@ -28,7 +330,9 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - const [transcription, setTranscription] = useState(''); + // CHANGE 1: Update state to hold Word[] instead of string + const [transcription, setTranscription] = useState([]); + const [audioURL, setAudioURL] = useState(''); const [liveTranscribing, setLiveTranscribing] = useState(false); const scrollViewRef = useRef(null); @@ -50,6 +354,16 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { AudioManager.requestRecordingPermissions(); }, []); + const getText = (words: Word[]) => { + return words + .map((w) => { + // Format: "hello (0.00s - 0.50s) " + // using toFixed(2) for cleaner timestamp display + return `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`; + }) + .join(''); + }; + const handleTranscribeFromURL = async () => { if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); @@ -66,6 +380,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); + // model.transcribe now returns Word[], which matches our state type setTranscription(await model.transcribe(audioBuffer)); } catch (error) { console.error('Error decoding audio data', error); @@ -76,7 +391,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const handleStartTranscribeFromMicrophone = async () => { setLiveTranscribing(true); - setTranscription(''); + setTranscription([]); // Reset to empty array recorder.onAudioReady(({ buffer }) => { model.streamInsert(buffer.getChannelData(0)); }); @@ -106,6 +421,13 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; + // CHANGE 3: Prepare the text for rendering + const displayedText = + transcription.length > 0 + ? getText(transcription) + : getText(model.committedTranscription) + + getText(model.nonCommittedTranscription); + return ( @@ -135,12 +457,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { scrollViewRef.current?.scrollToEnd({ animated: true }) } > - - {transcription !== '' - ? 
transcription - : model.committedTranscription + - model.nonCommittedTranscription} - + {displayedText} diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index f43d14180..df7f635d9 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -21,6 +21,8 @@ #include #include +using rnexecutorch::models::speech_to_text::types::Word; + namespace rnexecutorch::jsi_conversion { using namespace facebook; @@ -64,18 +66,24 @@ getValue>(const jsi::Value &val, } template <> -inline getValue(const jsi::Value &val, jsi::Runtime &runtime) { - jsi::Array jsiArr(rt, words.size()); - for (size_t i = 0; i < words.size(); ++i) { - jsi::Object obj(rt); - obj.setProperty(rt, "word", - jsi::String::createFromUtf8(rt, words[i].content)); - obj.setProperty(rt, "start", static_cast(words[i].start)); - obj.setProperty(rt, "end", static_cast(words[i].end)); - jsiArr.setValueAtIndex(rt, i, obj); - } - return jsiArr; -}; +inline Word getValue(const jsi::Value &val, jsi::Runtime &runtime) { + jsi::Object obj = val.asObject(runtime); + + // 1. Extract the string "word" using the existing string helper + std::string content = getValue(obj.getProperty(runtime, "word"), runtime); + + // 2. Extract start/end times + // We use .asNumber() directly as these are primitives + double start = obj.getProperty(runtime, "start").asNumber(); + double end = obj.getProperty(runtime, "end").asNumber(); + + // 3. Construct and return the C++ Word struct + return Word{ + .content = std::move(content), + .start = static_cast(start), + .end = static_cast(end) + }; +} template <> inline JSTensorViewIn getValue(const jsi::Value &val, @@ -305,6 +313,23 @@ inline jsi::Value getJsiValue(std::shared_ptr valuePtr, return std::move(*valuePtr); } +inline jsi::Value getJsiValue(const Word &word, jsi::Runtime &runtime) { + jsi::Object obj(runtime); + obj.setProperty(runtime, "word", jsi::String::createFromUtf8(runtime, word.content)); + obj.setProperty(runtime, "start", static_cast(word.start)); + obj.setProperty(runtime, "end", static_cast(word.end)); + return obj; +} + +inline jsi::Value getJsiValue(const std::vector &vec, jsi::Runtime &runtime) { + jsi::Array array(runtime, vec.size()); + for (size_t i = 0; i < vec.size(); ++i) { + // Convert each Word using the helper above and place in array + array.setValueAtIndex(runtime, i, getJsiValue(vec[i], runtime)); + } + return {runtime, array}; +} + inline jsi::Value getJsiValue(const std::vector &vec, jsi::Runtime &runtime) { jsi::Array array(runtime, vec.size()); diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index fc96965ec..c71a58e41 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -26,6 +26,8 @@ #include #include +using rnexecutorch::models::speech_to_text::types::Word; + namespace rnexecutorch { template class ModelHostObject : public JsiHostObject { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index 7f8ef81cf..a97edfcb9 
100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -43,28 +43,28 @@ SpeechToText::decode(std::span tokens, return std::make_shared(decoderOutput); } -std::vector SpeechToText::transcribe(std::span waveform, - std::string languageOption) const { - std::vector segments = - this->asr->transcribe(waveform, DecodingOptions(languageOption)); - std::string transcription; - - size_t transcriptionLength = 0; - for (auto &segment : segments) { - for (auto &word : segment.words) { - transcriptionLength += word.content.size(); - } - } - transcription.reserve(transcriptionLength); +// std::vector SpeechToText::transcribe(std::span waveform, +// std::string languageOption) const { +// std::vector segments = +// this->asr->transcribe(waveform, DecodingOptions(languageOption)); +// std::string transcription; + +// size_t transcriptionLength = 0; +// for (auto &segment : segments) { +// for (auto &word : segment.words) { +// transcriptionLength += word.content.size(); +// } +// } +// transcription.reserve(transcriptionLength); - for (auto &segment : segments) { - for (auto &word : segment.words) { - transcription += word.content; - } - } +// for (auto &segment : segments) { +// for (auto &word : segment.words) { +// transcription += word.content; +// } +// } - return {transcription.begin(), transcription.end()}; -} +// return {transcription.begin(), transcription.end()}; +// } std::vector SpeechToText::transcribe(std::span waveform, std::string languageOption) const { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp index b8a7aced4..f62986b72 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp @@ -35,7 +35,11 @@ ProcessResult OnlineASRProcessor::processIter(const DecodingOptions &options) { } std::deque nonCommittedWords = this->hypothesisBuffer.complete(); - return {this->toFlush(flushed), this->toFlush(nonCommittedWords)}; + // return {this->toFlush(flushed), this->toFlush(nonCommittedWords)}; + return {std::vector(std::make_move_iterator(flushed.begin()), + std::make_move_iterator(flushed.end())), + std::vector(std::make_move_iterator(nonCommittedWords.begin()), + std::make_move_iterator(nonCommittedWords.end()))}; } void OnlineASRProcessor::chunkCompletedSegment(std::span res) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h index 0cb05e5a6..685ba2b76 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h @@ -4,9 +4,14 @@ namespace rnexecutorch::models::speech_to_text::types { +// struct ProcessResult { +// std::string committed; +// std::string nonCommitted; +// }; + struct ProcessResult { - std::string committed; - std::string nonCommitted; + std::vector committed; + std::vector nonCommitted; }; } // namespace rnexecutorch::models::speech_to_text::types From 
c0218bfb8de675935be934bfcbca340ccc715293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 15:36:31 +0100 Subject: [PATCH 04/14] Working version of both timestamping and regular version --- apps/speech/screens/SpeechToTextScreen.tsx | 414 +++++++++++++++++- .../host_objects/ModelHostObject.h | 5 + .../models/speech_to_text/SpeechToText.cpp | 109 +++-- .../models/speech_to_text/SpeechToText.h | 6 +- .../useSpeechToText.ts | 243 ++++++++-- .../SpeechToTextModule.ts | 290 ++++-------- .../react-native-executorch/src/types/stt.ts | 1 + 7 files changed, 755 insertions(+), 313 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index ad78dcb49..9dab4420b 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -295,6 +295,323 @@ // }, // }); +// import React, { useEffect, useRef, useState } from 'react'; +// import { +// Text, +// View, +// StyleSheet, +// TouchableOpacity, +// ScrollView, +// TextInput, +// KeyboardAvoidingView, +// Platform, +// } from 'react-native'; +// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +// import { +// useSpeechToText, +// WHISPER_TINY_EN, +// // Make sure Word is exported from your module +// Word, +// } from 'react-native-executorch'; +// import FontAwesome from '@expo/vector-icons/FontAwesome'; +// import { +// AudioManager, +// AudioRecorder, +// AudioContext, +// } from 'react-native-audio-api'; +// import * as FileSystem from 'expo-file-system/legacy'; +// import SWMIcon from '../assets/swm_icon.svg'; +// import DeviceInfo from 'react-native-device-info'; + +// const isSimulator = DeviceInfo.isEmulatorSync(); + +// export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { +// const model = useSpeechToText({ +// model: WHISPER_TINY_EN, +// }); + +// // CHANGE 1: Update state to hold Word[] instead of string +// const [transcription, setTranscription] = useState([]); + +// const [audioURL, setAudioURL] = useState(''); +// const [liveTranscribing, setLiveTranscribing] = useState(false); +// const scrollViewRef = useRef(null); + +// const [recorder] = useState( +// () => +// new AudioRecorder({ +// sampleRate: 16000, +// bufferLengthInSamples: 1600, +// }) +// ); + +// useEffect(() => { +// AudioManager.setAudioSessionOptions({ +// iosCategory: 'playAndRecord', +// iosMode: 'spokenAudio', +// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], +// }); +// AudioManager.requestRecordingPermissions(); +// }, []); + +// const getText = (words: Word[]) => { +// return words +// .map((w) => { +// // Format: "hello (0.00s - 0.50s) " +// // using toFixed(2) for cleaner timestamp display +// return `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`; +// }) +// .join(''); +// }; + +// const handleTranscribeFromURL = async () => { +// if (!audioURL.trim()) { +// console.warn('Please provide a valid audio file URL'); +// return; +// } + +// const { uri } = await FileSystem.downloadAsync( +// audioURL, +// FileSystem.cacheDirectory + 'audio_file' +// ); + +// const audioContext = new AudioContext({ sampleRate: 16000 }); + +// try { +// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); +// const audioBuffer = decodedAudioData.getChannelData(0); +// // model.transcribe now returns Word[], which matches our state type +// setTranscription(await model.transcribe(audioBuffer)); +// } catch (error) { +// console.error('Error decoding audio 
data', error); +// console.warn('Note: Supported file formats: mp3, wav, flac'); +// return; +// } +// }; + +// const handleStartTranscribeFromMicrophone = async () => { +// setLiveTranscribing(true); +// setTranscription([]); // Reset to empty array +// recorder.onAudioReady(({ buffer }) => { +// model.streamInsert(buffer.getChannelData(0)); +// }); +// recorder.start(); + +// try { +// await model.stream(); +// } catch (error) { +// console.error('Error during live transcription:', error); +// } +// }; + +// const handleStopTranscribeFromMicrophone = () => { +// recorder.stop(); +// model.streamStop(); +// console.log('Live transcription stopped'); +// setLiveTranscribing(false); +// }; + +// const getModelStatus = () => { +// if (model.error) return `${model.error}`; +// if (model.isGenerating) return 'Transcribing...'; +// if (model.isReady) return 'Ready to transcribe'; +// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; +// }; + +// const readyToTranscribe = !model.isGenerating && model.isReady; +// const recordingButtonDisabled = isSimulator || !readyToTranscribe; + +// // CHANGE 3: Prepare the text for rendering +// const displayedText = +// transcription.length > 0 +// ? getText(transcription) +// : getText(model.committedTranscription) + +// getText(model.nonCommittedTranscription); + +// return ( +// +// +// +// +// +// +// +// +// React Native ExecuTorch +// Speech to Text +// + +// +// Status: {getModelStatus()} +// + +// +// Transcription +// +// scrollViewRef.current?.scrollToEnd({ animated: true }) +// } +// > +// {displayedText} +// +// + +// +// +// +// +// Start +// +// + +// {liveTranscribing ? ( +// +// +// Stop Live Transcription +// +// ) : ( +// +// +// +// {isSimulator +// ? 'Recording is not available on Simulator' +// : 'Start Live Transcription'} +// +// +// )} +// +// +// +// +// ); +// }; + +// const styles = StyleSheet.create({ +// container: { +// flex: 1, +// alignItems: 'center', +// backgroundColor: 'white', +// paddingHorizontal: 16, +// }, +// keyboardAvoidingView: { +// flex: 1, +// width: '100%', +// }, +// header: { +// alignItems: 'center', +// position: 'relative', +// width: '100%', +// }, +// backButton: { +// position: 'absolute', +// left: 0, +// top: 10, +// padding: 10, +// zIndex: 1, +// }, +// headerText: { +// fontSize: 22, +// fontWeight: 'bold', +// color: '#0f186e', +// }, +// statusContainer: { +// marginTop: 12, +// alignItems: 'center', +// }, +// transcriptionContainer: { +// flex: 1, +// width: '100%', +// marginVertical: 12, +// }, +// transcriptionLabel: { +// marginLeft: 12, +// marginBottom: 4, +// color: '#0f186e', +// }, +// transcriptionScrollContainer: { +// borderRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// padding: 12, +// }, +// inputContainer: { +// marginBottom: 12, +// }, +// urlTranscriptionContainer: { +// width: '100%', +// flexDirection: 'row', +// }, +// urlTranscriptionInput: { +// flex: 1, +// padding: 12, +// borderTopLeftRadius: 12, +// borderBottomLeftRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// borderRightWidth: 0, +// }, +// urlTranscriptionButton: { +// backgroundColor: '#0f186e', +// justifyContent: 'center', +// alignItems: 'center', +// padding: 12, +// borderTopRightRadius: 12, +// borderBottomRightRadius: 12, +// }, +// buttonText: { +// color: 'white', +// fontWeight: '600', +// letterSpacing: -0.5, +// fontSize: 16, +// }, +// liveTranscriptionButton: { +// flexDirection: 'row', +// justifyContent: 'center', +// alignItems: 'center', +// 
padding: 12, +// borderRadius: 12, +// marginTop: 12, +// gap: 8, +// }, +// backgroundRed: { +// backgroundColor: 'red', +// }, +// backgroundBlue: { +// backgroundColor: '#0f186e', +// }, +// disabled: { +// opacity: 0.5, +// }, +// }); + import React, { useEffect, useRef, useState } from 'react'; import { Text, @@ -305,12 +622,12 @@ import { TextInput, KeyboardAvoidingView, Platform, + Switch, // Import Switch } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { useSpeechToText, WHISPER_TINY_EN, - // Make sure Word is exported from your module Word, } from 'react-native-executorch'; import FontAwesome from '@expo/vector-icons/FontAwesome'; @@ -330,8 +647,11 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - // CHANGE 1: Update state to hold Word[] instead of string - const [transcription, setTranscription] = useState([]); + // CHANGE 1: State can now be string OR Word[] + const [transcription, setTranscription] = useState(''); + + // CHANGE 2: Add toggle for timestamps + const [enableTimestamps, setEnableTimestamps] = useState(false); const [audioURL, setAudioURL] = useState(''); const [liveTranscribing, setLiveTranscribing] = useState(false); @@ -354,17 +674,20 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { AudioManager.requestRecordingPermissions(); }, []); - const getText = (words: Word[]) => { - return words - .map((w) => { - // Format: "hello (0.00s - 0.50s) " - // using toFixed(2) for cleaner timestamp display - return `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`; - }) + // CHANGE 3: Smart helper that handles both formats + const getText = (data: string | Word[] | undefined) => { + console.log('UI Received:', JSON.stringify(data)); + if (!data) return ''; + if (typeof data === 'string') return data; + + // It's Word[], format with timestamps + return data + .map((w) => `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`) .join(''); }; const handleTranscribeFromURL = async () => { + console.log('[1] UI: Button Pressed. Calling model.stream()...'); if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); return; @@ -380,8 +703,20 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); - // model.transcribe now returns Word[], which matches our state type - setTranscription(await model.transcribe(audioBuffer)); + + // CHANGE 4: Pass the toggle flag to transcribe + // TypeScript will infer the return type based on the flag + if (enableTimestamps) { + const result = await model.transcribe(audioBuffer, { + enableTimestamps: true, + }); + setTranscription(result); + } else { + const result = await model.transcribe(audioBuffer, { + enableTimestamps: false, + }); + setTranscription(result); + } } catch (error) { console.error('Error decoding audio data', error); console.warn('Note: Supported file formats: mp3, wav, flac'); @@ -391,14 +726,21 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const handleStartTranscribeFromMicrophone = async () => { setLiveTranscribing(true); - setTranscription([]); // Reset to empty array + // Reset based on mode + setTranscription(enableTimestamps ? 
[] : ''); + recorder.onAudioReady(({ buffer }) => { model.streamInsert(buffer.getChannelData(0)); }); recorder.start(); try { - await model.stream(); + // CHANGE 5: Pass the toggle flag to stream + if (enableTimestamps) { + await model.stream({ enableTimestamps: true }); + } else { + await model.stream({ enableTimestamps: false }); + } } catch (error) { console.error('Error during live transcription:', error); } @@ -421,12 +763,16 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; - // CHANGE 3: Prepare the text for rendering - const displayedText = - transcription.length > 0 - ? getText(transcription) - : getText(model.committedTranscription) + - getText(model.nonCommittedTranscription); + // CHANGE 6: Logic to choose what text to display + // We use getText() on everything so it converts Arrays to Strings before concatenation + const hasResult = Array.isArray(transcription) + ? transcription.length > 0 + : transcription.length > 0; + + const displayedText = hasResult + ? getText(transcription) + : getText(model.committedTranscription) + + getText(model.nonCommittedTranscription); return ( @@ -448,6 +794,21 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { Status: {getModelStatus()} + {/* CHANGE 7: Add UI for the Toggle */} + + Enable Timestamps + { + setEnableTimestamps(val); + setTranscription(val ? [] : ''); // Reset transcription on toggle + }} + trackColor={{ false: '#767577', true: '#0f186e' }} + thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'} + disabled={model.isGenerating} // Disable changing mode while running + /> + + Transcription void }) => { }; const styles = StyleSheet.create({ + // ... existing styles ... 
container: { flex: 1, alignItems: 'center', @@ -546,6 +908,18 @@ const styles = StyleSheet.create({ marginTop: 12, alignItems: 'center', }, + // New style for the toggle + toggleContainer: { + flexDirection: 'row', + alignItems: 'center', + marginTop: 10, + marginBottom: 5, + }, + toggleLabel: { + fontSize: 16, + marginRight: 10, + color: '#0f186e', + }, transcriptionContainer: { flex: 1, width: '100%', diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index c71a58e41..f0ec05b64 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -75,6 +75,11 @@ template class ModelHostObject : public JsiHostObject { promiseHostFunction<&Model::transcribe>, "transcribe")); + addFunctions( + JSI_EXPORT_FUNCTION(ModelHostObject, + promiseHostFunction<&Model::transcribeStringOnly>, + "transcribeStringOnly")); + addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject, promiseHostFunction<&Model::stream>, "stream")); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index a97edfcb9..68e63d612 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -44,7 +44,8 @@ SpeechToText::decode(std::span tokens, } // std::vector SpeechToText::transcribe(std::span waveform, -// std::string languageOption) const { +// std::string languageOption) const +// { // std::vector segments = // this->asr->transcribe(waveform, DecodingOptions(languageOption)); // std::string transcription; @@ -85,21 +86,47 @@ std::vector SpeechToText::transcribe(std::span waveform, } } - auto wordsToJsi = [](jsi::Runtime &rt, - const std::vector &words) -> jsi::Value { - jsi::Array jsiArr(rt, words.size()); - for (size_t i = 0; i < words.size(); ++i) { - jsi::Object obj(rt); - obj.setProperty(rt, "word", - jsi::String::createFromUtf8(rt, words[i].content)); - obj.setProperty(rt, "start", static_cast(words[i].start)); - obj.setProperty(rt, "end", static_cast(words[i].end)); - jsiArr.setValueAtIndex(rt, i, obj); + return transcription; +} + +std::vector +SpeechToText::transcribeStringOnly(std::span waveform, + std::string languageOption) const { + std::vector segments = + this->asr->transcribe(waveform, DecodingOptions(languageOption)); + std::string transcription; + + size_t transcriptionLength = 0; + for (auto &segment : segments) { + for (auto &word : segment.words) { + transcriptionLength += word.content.size(); } - return jsiArr; - }; + } + transcription.reserve(transcriptionLength); - return transcription; + for (auto &segment : segments) { + for (auto &word : segment.words) { + transcription += word.content; + } + } + + return {transcription.begin(), transcription.end()}; +} + +std::vector mergeWordsToString(const std::vector &words) { + std::string result; + size_t totalLength = 0; + + for (const auto &word : words) { + totalLength += word.content.size(); + } + result.reserve(totalLength); + + for (const auto &word : words) { + result += word.content; + } + + return {result.begin(), result.end()}; } size_t SpeechToText::getMemoryLowerBound() const noexcept { @@ -153,38 +180,25 @@ size_t 
SpeechToText::getMemoryLowerBound() const noexcept { // } void SpeechToText::stream(std::shared_ptr callback, - std::string languageOption) { + std::string languageOption, bool enableTimestamps) { if (this->isStreaming) { throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress, "Streaming is already in progress!"); } - auto wordsToJsi = [](jsi::Runtime &rt, - const std::vector &words) -> jsi::Value { - jsi::Array jsiArr(rt, words.size()); - for (size_t i = 0; i < words.size(); ++i) { - jsi::Object obj(rt); - obj.setProperty(rt, "word", - jsi::String::createFromUtf8(rt, words[i].content)); - obj.setProperty(rt, "start", static_cast(words[i].start)); - obj.setProperty(rt, "end", static_cast(words[i].end)); - jsiArr.setValueAtIndex(rt, i, obj); - } - return jsiArr; - }; - - auto nativeCallback = [this, callback, - wordsToJsi](const std::vector &committedVec, - const std::vector &nonCommittedVec, - bool isDone) { - this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec, - isDone, wordsToJsi](jsi::Runtime &rt) { - jsi::Value committedJsi = wordsToJsi(rt, committedVec); - jsi::Value nonCommittedJsi = wordsToJsi(rt, nonCommittedVec); - - callback->call(rt, std::move(committedJsi), std::move(nonCommittedJsi), - jsi::Value(isDone)); - }); + auto nativeCallback = [this, callback](const auto &committedVec, + const auto &nonCommittedVec, + bool isDone) { + this->callInvoker->invokeAsync( + [callback, committedVec, nonCommittedVec, isDone](jsi::Runtime &rt) { + jsi::Value committedJsi = + rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt); + jsi::Value nonCommittedJsi = + rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt); + + callback->call(rt, std::move(committedJsi), + std::move(nonCommittedJsi), jsi::Value(isDone)); + }); }; this->isStreaming = true; @@ -197,14 +211,23 @@ void SpeechToText::stream(std::shared_ptr callback, ProcessResult res = this->processor->processIter(DecodingOptions(languageOption)); - nativeCallback(res.committed, res.nonCommitted, false); + if (enableTimestamps) { + nativeCallback(res.committed, res.nonCommitted, false); + } else { + nativeCallback(mergeWordsToString(res.committed), + mergeWordsToString(res.nonCommitted), false); + } this->readyToProcess = false; } // finish() now returns std::vector std::vector committed = this->processor->finish(); - nativeCallback(committed, {}, true); + if (enableTimestamps) { + nativeCallback(committed, std::vector{}, true); + } else { + nativeCallback(mergeWordsToString(committed), std::vector(), true); + } this->resetStreamState(); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h index af02a5357..8f6799c4e 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h @@ -27,11 +27,15 @@ class SpeechToText { [[nodiscard("Registered non-void function")]] std::vector transcribe(std::span waveform, std::string languageOption) const; + [[nodiscard("Registered non-void function")]] + std::vector transcribeStringOnly(std::span waveform, + std::string languageOption) const; + size_t getMemoryLowerBound() const noexcept; // Stream void stream(std::shared_ptr callback, - std::string languageOption); + std::string languageOption, bool enableTimestamps); void streamStop(); void streamInsert(std::span waveform); diff 
--git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 6f22bf7b5..7f42d33cb 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -136,8 +136,164 @@ // }; // }; +// import { useEffect, useCallback, useState } from 'react'; +// // Make sure Word is exported from your module file +// import { +// SpeechToTextModule, +// Word, +// } from '../../modules/natural_language_processing/SpeechToTextModule'; +// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; +// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; +// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; + +// export const useSpeechToText = ({ +// model, +// preventLoad = false, +// }: { +// model: SpeechToTextModelConfig; +// preventLoad?: boolean; +// }) => { +// const [error, setError] = useState(null); +// const [isReady, setIsReady] = useState(false); +// const [isGenerating, setIsGenerating] = useState(false); +// const [downloadProgress, setDownloadProgress] = useState(0); + +// const [modelInstance] = useState(() => new SpeechToTextModule()); + +// // FIX 1: Initialize with empty array [], generic type Word[] +// const [committedTranscription, setCommittedTranscription] = useState( +// [] +// ); +// const [nonCommittedTranscription, setNonCommittedTranscription] = useState< +// Word[] +// >([]); + +// useEffect(() => { +// if (preventLoad) return; +// (async () => { +// setDownloadProgress(0); +// setError(null); +// try { +// setIsReady(false); +// await modelInstance.load( +// { +// isMultilingual: model.isMultilingual, +// encoderSource: model.encoderSource, +// decoderSource: model.decoderSource, +// tokenizerSource: model.tokenizerSource, +// }, +// setDownloadProgress +// ); +// setIsReady(true); +// } catch (err) { +// setError(parseUnknownError(err)); +// } +// })(); +// }, [ +// modelInstance, +// model.isMultilingual, +// model.encoderSource, +// model.decoderSource, +// model.tokenizerSource, +// preventLoad, +// ]); + +// const stateWrapper = useCallback( +// Promise>(fn: T) => +// async (...args: Parameters): Promise>> => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. Please load the model before calling this function.' +// ); +// if (isGenerating) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModelGenerating, +// 'The model is currently generating. Please wait until previous model run is complete.' +// ); +// setIsGenerating(true); +// try { +// return await fn.apply(modelInstance, args); +// } finally { +// setIsGenerating(false); +// } +// }, +// [isReady, isGenerating, modelInstance] +// ); + +// const stream = useCallback( +// async (options?: DecodingOptions) => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. Please load the model before calling this function.' +// ); +// if (isGenerating) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModelGenerating, +// 'The model is currently generating. Please wait until previous model run is complete.' 
+// ); +// setIsGenerating(true); + +// // FIX 2: Reset to empty arrays +// setCommittedTranscription([]); +// setNonCommittedTranscription([]); + +// // Accumulator is now an array of Words, not a string +// const fullResult: Word[] = []; + +// try { +// for await (const { committed, nonCommitted } of modelInstance.stream( +// options +// )) { +// // FIX 3: Update state by appending arrays +// if (committed.length > 0) { +// setCommittedTranscription((prev) => [...prev, ...committed]); +// fullResult.push(...committed); +// } + +// // nonCommitted is always a fresh partial chunk +// setNonCommittedTranscription(nonCommitted); +// } +// } finally { +// setIsGenerating(false); +// } +// return fullResult; +// }, +// [isReady, isGenerating, modelInstance] +// ); + +// const wrapper = useCallback( +// any>(fn: T) => { +// return (...args: Parameters): ReturnType => { +// if (!isReady) +// throw new RnExecutorchError( +// RnExecutorchErrorCode.ModuleNotLoaded, +// 'The model is currently not loaded. Please load the model before calling this function.' +// ); +// return fn.apply(modelInstance, args); +// }; +// }, +// [isReady, modelInstance] +// ); + +// return { +// error, +// isReady, +// isGenerating, +// downloadProgress, +// committedTranscription, +// nonCommittedTranscription, +// encode: stateWrapper(SpeechToTextModule.prototype.encode), +// decode: stateWrapper(SpeechToTextModule.prototype.decode), +// transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), +// stream, +// streamStop: wrapper(SpeechToTextModule.prototype.streamStop), +// streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), +// }; +// }; + import { useEffect, useCallback, useState } from 'react'; -// Make sure Word is exported from your module file import { SpeechToTextModule, Word, @@ -160,13 +316,13 @@ export const useSpeechToText = ({ const [modelInstance] = useState(() => new SpeechToTextModule()); - // FIX 1: Initialize with empty array [], generic type Word[] - const [committedTranscription, setCommittedTranscription] = useState( - [] - ); + // FIX 1: Allow state to be either string or Word[] + const [committedTranscription, setCommittedTranscription] = useState< + string | Word[] + >(''); const [nonCommittedTranscription, setNonCommittedTranscription] = useState< - Word[] - >([]); + string | Word[] + >(''); useEffect(() => { if (preventLoad) return; @@ -189,14 +345,7 @@ export const useSpeechToText = ({ setError(parseUnknownError(err)); } })(); - }, [ - modelInstance, - model.isMultilingual, - model.encoderSource, - model.decoderSource, - model.tokenizerSource, - preventLoad, - ]); + }, [modelInstance, model, preventLoad]); const stateWrapper = useCallback( Promise>(fn: T) => @@ -204,12 +353,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' + 'The model is currently not loaded.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' + 'The model is currently generating.' ); setIsGenerating(true); try { @@ -222,38 +371,66 @@ export const useSpeechToText = ({ ); const stream = useCallback( - async (options?: DecodingOptions) => { + async (options?: DecodingOptions & { enableTimestamps?: boolean }) => { + console.log( + '[2] Hook: Stream called. 
Ready:', + isReady, + 'Generating:', + isGenerating + ); if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' + 'Model not loaded' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' + 'Model is generating' ); + setIsGenerating(true); - // FIX 2: Reset to empty arrays - setCommittedTranscription([]); - setNonCommittedTranscription([]); + // FIX 2: Reset based on the mode requested + const enableTimestamps = options?.enableTimestamps ?? false; + setCommittedTranscription(enableTimestamps ? [] : ''); + setNonCommittedTranscription(enableTimestamps ? [] : ''); - // Accumulator is now an array of Words, not a string - const fullResult: Word[] = []; + let fullResult: string | Word[] = enableTimestamps ? [] : ''; try { + console.log('[3] Hook: Calling modelInstance.stream()'); + // @ts-ignore - Typescript struggles with the dual generator return type, but logic is safe for await (const { committed, nonCommitted } of modelInstance.stream( options )) { - // FIX 3: Update state by appending arrays - if (committed.length > 0) { - setCommittedTranscription((prev) => [...prev, ...committed]); - fullResult.push(...committed); - } + console.log(committed, nonCommitted); + // FIX 3: Dynamic Merging Logic + if (typeof committed === 'string') { + // --- STRING MODE --- + if (committed.length > 0) { + setCommittedTranscription((prev) => { + // Safety check: if prev was somehow an array, reset it or cast to string + const prevStr = typeof prev === 'string' ? prev : ''; + return prevStr + committed; + }); + (fullResult as string) += committed; + } + setNonCommittedTranscription(nonCommitted as string); + } else { + // --- WORD[] MODE --- + const committedWords = committed as Word[]; + const nonCommittedWords = nonCommitted as Word[]; - // nonCommitted is always a fresh partial chunk - setNonCommittedTranscription(nonCommitted); + if (committedWords.length > 0) { + setCommittedTranscription((prev) => { + const prevArr = Array.isArray(prev) ? prev : []; + return [...prevArr, ...committedWords]; + }); + (fullResult as Word[]).push(...committedWords); + } + setNonCommittedTranscription(nonCommittedWords); + } } } finally { setIsGenerating(false); @@ -269,7 +446,7 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' 
+ 'Model not loaded' ); return fn.apply(modelInstance, args); }; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index e0ca88251..5d5fd3248 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -1,194 +1,3 @@ -// import { Logger } from '../../common/Logger'; -// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; -// import { ResourceFetcher } from '../../utils/ResourceFetcher'; -// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; -// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; - -// export class SpeechToTextModule { -// private nativeModule: any; - -// private modelConfig!: SpeechToTextModelConfig; - -// private textDecoder = new TextDecoder('utf-8', { -// fatal: false, -// ignoreBOM: true, -// }); - -// public async load( -// model: SpeechToTextModelConfig, -// onDownloadProgressCallback: (progress: number) => void = () => {} -// ) { -// this.modelConfig = model; - -// const tokenizerLoadPromise = ResourceFetcher.fetch( -// undefined, -// model.tokenizerSource -// ); -// const encoderDecoderPromise = ResourceFetcher.fetch( -// onDownloadProgressCallback, -// model.encoderSource, -// model.decoderSource -// ); -// const [tokenizerSources, encoderDecoderResults] = await Promise.all([ -// tokenizerLoadPromise, -// encoderDecoderPromise, -// ]); -// const encoderSource = encoderDecoderResults?.[0]; -// const decoderSource = encoderDecoderResults?.[1]; -// if (!encoderSource || !decoderSource || !tokenizerSources) { -// throw new RnExecutorchError( -// RnExecutorchErrorCode.DownloadInterrupted, -// 'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.' -// ); -// } -// this.nativeModule = await global.loadSpeechToText( -// encoderSource, -// decoderSource, -// tokenizerSources[0]! 
-// ); -// } - -// public delete(): void { -// this.nativeModule.unload(); -// } - -// public async encode( -// waveform: Float32Array | number[] -// ): Promise { -// if (Array.isArray(waveform)) { -// Logger.info( -// 'Passing waveform as number[] is deprecated, use Float32Array instead' -// ); -// waveform = new Float32Array(waveform); -// } -// return new Float32Array(await this.nativeModule.encode(waveform)); -// } - -// public async decode( -// tokens: Int32Array | number[], -// encoderOutput: Float32Array | number[] -// ): Promise { -// if (Array.isArray(tokens)) { -// Logger.info( -// 'Passing tokens as number[] is deprecated, use Int32Array instead' -// ); -// tokens = new Int32Array(tokens); -// } -// if (Array.isArray(encoderOutput)) { -// Logger.info( -// 'Passing encoderOutput as number[] is deprecated, use Float32Array instead' -// ); -// encoderOutput = new Float32Array(encoderOutput); -// } -// return new Float32Array( -// await this.nativeModule.decode(tokens, encoderOutput) -// ); -// } - -// public async transcribe( -// waveform: Float32Array | number[], -// options: DecodingOptions = {} -// ): Promise { -// this.validateOptions(options); - -// if (Array.isArray(waveform)) { -// Logger.info( -// 'Passing waveform as number[] is deprecated, use Float32Array instead' -// ); -// waveform = new Float32Array(waveform); -// } -// const transcriptionBytes = await this.nativeModule.transcribe( -// waveform, -// options.language || '' -// ); -// return this.textDecoder.decode(new Uint8Array(transcriptionBytes)); -// } - -// public async *stream( -// options: DecodingOptions = {} -// ): AsyncGenerator<{ committed: string; nonCommitted: string }> { -// this.validateOptions(options); - -// const queue: { committed: string; nonCommitted: string }[] = []; -// let waiter: (() => void) | null = null; -// let finished = false; -// let error: unknown; - -// const wake = () => { -// waiter?.(); -// waiter = null; -// }; - -// (async () => { -// try { -// await this.nativeModule.stream( -// (committed: number[], nonCommitted: number[], isDone: boolean) => { -// queue.push({ -// committed: this.textDecoder.decode(new Uint8Array(committed)), -// nonCommitted: this.textDecoder.decode( -// new Uint8Array(nonCommitted) -// ), -// }); -// if (isDone) { -// finished = true; -// } -// wake(); -// }, -// options.language || '' -// ); -// finished = true; -// wake(); -// } catch (e) { -// error = e; -// finished = true; -// wake(); -// } -// })(); - -// while (true) { -// if (queue.length > 0) { -// yield queue.shift()!; -// if (finished && queue.length === 0) { -// return; -// } -// continue; -// } -// if (error) throw parseUnknownError(error); -// if (finished) return; -// await new Promise((r) => (waiter = r)); -// } -// } - -// public streamInsert(waveform: Float32Array | number[]): void { -// if (Array.isArray(waveform)) { -// Logger.info( -// 'Passing waveform as number[] is deprecated, use Float32Array instead' -// ); -// waveform = new Float32Array(waveform); -// } -// this.nativeModule.streamInsert(waveform); -// } - -// public streamStop(): void { -// this.nativeModule.streamStop(); -// } - -// private validateOptions(options: DecodingOptions) { -// if (!this.modelConfig.isMultilingual && options.language) { -// throw new RnExecutorchError( -// RnExecutorchErrorCode.InvalidConfig, -// 'Model is not multilingual, cannot set language' -// ); -// } -// if (this.modelConfig.isMultilingual && !options.language) { -// throw new RnExecutorchError( -// RnExecutorchErrorCode.InvalidConfig, -// 'Model 
is multilingual, provide a language' -// ); -// } -// } -// } - import { Logger } from '../../common/Logger'; import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; import { ResourceFetcher } from '../../utils/ResourceFetcher'; @@ -206,7 +15,10 @@ export class SpeechToTextModule { private nativeModule: any; private modelConfig!: SpeechToTextModelConfig; - // 2. TextDecoder is removed as C++ now returns JS objects directly + private textDecoder = new TextDecoder('utf-8', { + fatal: false, + ignoreBOM: true, + }); public async load( model: SpeechToTextModelConfig, @@ -279,11 +91,20 @@ export class SpeechToTextModule { ); } - // 3. Update transcribe to return Word[] instead of string + public async transcribe( + waveform: Float32Array | number[], + options?: DecodingOptions & { enableTimestamps: true } + ): Promise; + + public async transcribe( + waveform: Float32Array | number[], + options?: DecodingOptions & { enableTimestamps?: false | undefined } + ): Promise; + public async transcribe( waveform: Float32Array | number[], options: DecodingOptions = {} - ): Promise { + ): Promise { this.validateOptions(options); if (Array.isArray(waveform)) { @@ -293,22 +114,45 @@ export class SpeechToTextModule { waveform = new Float32Array(waveform); } - // The native module now returns an Array of Objects, not bytes - const transcription: Word[] = await this.nativeModule.transcribe( - waveform, - options.language || '' - ); + const language = options.language || ''; + + if (options.enableTimestamps) { + return await this.nativeModule.transcribe(waveform, language); + } else { + const transcriptionBytes = await this.nativeModule.transcribeStringOnly( + waveform, + language + ); - return transcription; + return this.textDecoder.decode(new Uint8Array(transcriptionBytes)); + } } - // 4. 
Update stream to yield Word[] structure + public stream( + options: DecodingOptions & { enableTimestamps: true } + ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }>; + + public stream( + options?: DecodingOptions & { enableTimestamps?: false | undefined } + ): AsyncGenerator<{ committed: string; nonCommitted: string }>; + public async *stream( options: DecodingOptions = {} - ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }> { + ): AsyncGenerator<{ + committed: string | Word[]; + nonCommitted: string | Word[]; + }> { + console.log('[4] Module: Entered stream method'); this.validateOptions(options); - const queue: { committed: Word[]; nonCommitted: Word[] }[] = []; + // Ensure we strictly default to false + const enableTimestamps = options.enableTimestamps === true; + + const queue: { + committed: string | Word[]; + nonCommitted: string | Word[]; + }[] = []; + let waiter: (() => void) | null = null; let finished = false; let error: unknown; @@ -320,20 +164,34 @@ export class SpeechToTextModule { (async () => { try { - await this.nativeModule.stream( - // Callback now receives arrays of objects directly - (committed: Word[], nonCommitted: Word[], isDone: boolean) => { - queue.push({ - committed, - nonCommitted, - }); - if (isDone) { - finished = true; + const callback = ( + committed: any, + nonCommitted: any, + isDone: boolean + ) => { + if (!enableTimestamps) { + try { + queue.push({ + committed: this.textDecoder.decode(new Uint8Array(committed)), + nonCommitted: this.textDecoder.decode( + new Uint8Array(nonCommitted) + ), + }); + } catch (err) { + console.error('[Stream Decode Error]', err); } - wake(); - }, - options.language || '' - ); + } else { + queue.push({ committed, nonCommitted }); + } + + if (isDone) finished = true; + wake(); + }; + + const language = options.language || ''; + + await this.nativeModule.stream(callback, language, enableTimestamps); + finished = true; wake(); } catch (e) { diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts index 20627ca11..8f95eb16d 100644 --- a/packages/react-native-executorch/src/types/stt.ts +++ b/packages/react-native-executorch/src/types/stt.ts @@ -80,6 +80,7 @@ export type SpeechToTextLanguage = export interface DecodingOptions { language?: SpeechToTextLanguage; + enableTimestamps?: boolean; } export interface SpeechToTextModelConfig { From 9846acbe1f6099355b74a5874cc4985fac8afeb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 15:54:28 +0100 Subject: [PATCH 05/14] Clear files --- apps/speech/screens/SpeechToTextScreen.tsx | 640 +----------------- .../host_objects/JsiConversions.h | 4 - .../models/speech_to_text/SpeechToText.cpp | 70 -- .../useSpeechToText.ts | 295 -------- .../SpeechToTextModule.ts | 3 - 5 files changed, 5 insertions(+), 1007 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 9dab4420b..f844241f3 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -1,617 +1,3 @@ -// import React, { useEffect, useRef, useState } from 'react'; -// import { -// Text, -// View, -// StyleSheet, -// TouchableOpacity, -// ScrollView, -// TextInput, -// KeyboardAvoidingView, -// Platform, -// } from 'react-native'; -// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; -// import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; -// import 
FontAwesome from '@expo/vector-icons/FontAwesome'; -// import { -// AudioManager, -// AudioRecorder, -// AudioContext, -// } from 'react-native-audio-api'; -// import * as FileSystem from 'expo-file-system/legacy'; -// import SWMIcon from '../assets/swm_icon.svg'; -// import DeviceInfo from 'react-native-device-info'; - -// const isSimulator = DeviceInfo.isEmulatorSync(); - -// export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { -// const model = useSpeechToText({ -// model: WHISPER_TINY_EN, -// }); - -// const [transcription, setTranscription] = useState(''); -// const [audioURL, setAudioURL] = useState(''); -// const [liveTranscribing, setLiveTranscribing] = useState(false); -// const scrollViewRef = useRef(null); - -// const [recorder] = useState( -// () => -// new AudioRecorder({ -// sampleRate: 16000, -// bufferLengthInSamples: 1600, -// }) -// ); - -// useEffect(() => { -// AudioManager.setAudioSessionOptions({ -// iosCategory: 'playAndRecord', -// iosMode: 'spokenAudio', -// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], -// }); -// AudioManager.requestRecordingPermissions(); -// }, []); - -// const handleTranscribeFromURL = async () => { -// if (!audioURL.trim()) { -// console.warn('Please provide a valid audio file URL'); -// return; -// } - -// const { uri } = await FileSystem.downloadAsync( -// audioURL, -// FileSystem.cacheDirectory + 'audio_file' -// ); - -// const audioContext = new AudioContext({ sampleRate: 16000 }); - -// try { -// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); -// const audioBuffer = decodedAudioData.getChannelData(0); -// setTranscription(await model.transcribe(audioBuffer)); -// } catch (error) { -// console.error('Error decoding audio data', error); -// console.warn('Note: Supported file formats: mp3, wav, flac'); -// return; -// } -// }; - -// const handleStartTranscribeFromMicrophone = async () => { -// setLiveTranscribing(true); -// setTranscription(''); -// recorder.onAudioReady(({ buffer }) => { -// model.streamInsert(buffer.getChannelData(0)); -// }); -// recorder.start(); - -// try { -// await model.stream(); -// } catch (error) { -// console.error('Error during live transcription:', error); -// } -// }; - -// const handleStopTranscribeFromMicrophone = () => { -// recorder.stop(); -// model.streamStop(); -// console.log('Live transcription stopped'); -// setLiveTranscribing(false); -// }; - -// const getModelStatus = () => { -// if (model.error) return `${model.error}`; -// if (model.isGenerating) return 'Transcribing...'; -// if (model.isReady) return 'Ready to transcribe'; -// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; -// }; - -// const readyToTranscribe = !model.isGenerating && model.isReady; -// const recordingButtonDisabled = isSimulator || !readyToTranscribe; - -// return ( -// -// -// -// -// -// -// -// -// React Native ExecuTorch -// Speech to Text -// - -// -// Status: {getModelStatus()} -// - -// -// Transcription -// -// scrollViewRef.current?.scrollToEnd({ animated: true }) -// } -// > -// -// {transcription !== '' -// ? transcription -// : model.committedTranscription + -// model.nonCommittedTranscription} -// -// -// - -// -// -// -// -// Start -// -// - -// {liveTranscribing ? ( -// -// -// Stop Live Transcription -// -// ) : ( -// -// -// -// {isSimulator -// ? 
'Recording is not available on Simulator' -// : 'Start Live Transcription'} -// -// -// )} -// -// -// -// -// ); -// }; - -// const styles = StyleSheet.create({ -// container: { -// flex: 1, -// alignItems: 'center', -// backgroundColor: 'white', -// paddingHorizontal: 16, -// }, -// keyboardAvoidingView: { -// flex: 1, -// width: '100%', -// }, -// header: { -// alignItems: 'center', -// position: 'relative', -// width: '100%', -// }, -// backButton: { -// position: 'absolute', -// left: 0, -// top: 10, -// padding: 10, -// zIndex: 1, -// }, -// headerText: { -// fontSize: 22, -// fontWeight: 'bold', -// color: '#0f186e', -// }, -// statusContainer: { -// marginTop: 12, -// alignItems: 'center', -// }, -// transcriptionContainer: { -// flex: 1, -// width: '100%', -// marginVertical: 12, -// }, -// transcriptionLabel: { -// marginLeft: 12, -// marginBottom: 4, -// color: '#0f186e', -// }, -// transcriptionScrollContainer: { -// borderRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// padding: 12, -// }, -// inputContainer: { -// marginBottom: 12, -// }, -// urlTranscriptionContainer: { -// width: '100%', -// flexDirection: 'row', -// }, -// urlTranscriptionInput: { -// flex: 1, -// padding: 12, -// borderTopLeftRadius: 12, -// borderBottomLeftRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// borderRightWidth: 0, -// }, -// urlTranscriptionButton: { -// backgroundColor: '#0f186e', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderTopRightRadius: 12, -// borderBottomRightRadius: 12, -// }, -// buttonText: { -// color: 'white', -// fontWeight: '600', -// letterSpacing: -0.5, -// fontSize: 16, -// }, -// liveTranscriptionButton: { -// flexDirection: 'row', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderRadius: 12, -// marginTop: 12, -// gap: 8, -// }, -// backgroundRed: { -// backgroundColor: 'red', -// }, -// backgroundBlue: { -// backgroundColor: '#0f186e', -// }, -// disabled: { -// opacity: 0.5, -// }, -// }); - -// import React, { useEffect, useRef, useState } from 'react'; -// import { -// Text, -// View, -// StyleSheet, -// TouchableOpacity, -// ScrollView, -// TextInput, -// KeyboardAvoidingView, -// Platform, -// } from 'react-native'; -// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; -// import { -// useSpeechToText, -// WHISPER_TINY_EN, -// // Make sure Word is exported from your module -// Word, -// } from 'react-native-executorch'; -// import FontAwesome from '@expo/vector-icons/FontAwesome'; -// import { -// AudioManager, -// AudioRecorder, -// AudioContext, -// } from 'react-native-audio-api'; -// import * as FileSystem from 'expo-file-system/legacy'; -// import SWMIcon from '../assets/swm_icon.svg'; -// import DeviceInfo from 'react-native-device-info'; - -// const isSimulator = DeviceInfo.isEmulatorSync(); - -// export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { -// const model = useSpeechToText({ -// model: WHISPER_TINY_EN, -// }); - -// // CHANGE 1: Update state to hold Word[] instead of string -// const [transcription, setTranscription] = useState([]); - -// const [audioURL, setAudioURL] = useState(''); -// const [liveTranscribing, setLiveTranscribing] = useState(false); -// const scrollViewRef = useRef(null); - -// const [recorder] = useState( -// () => -// new AudioRecorder({ -// sampleRate: 16000, -// bufferLengthInSamples: 1600, -// }) -// ); - -// useEffect(() => { -// AudioManager.setAudioSessionOptions({ -// 
iosCategory: 'playAndRecord', -// iosMode: 'spokenAudio', -// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], -// }); -// AudioManager.requestRecordingPermissions(); -// }, []); - -// const getText = (words: Word[]) => { -// return words -// .map((w) => { -// // Format: "hello (0.00s - 0.50s) " -// // using toFixed(2) for cleaner timestamp display -// return `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`; -// }) -// .join(''); -// }; - -// const handleTranscribeFromURL = async () => { -// if (!audioURL.trim()) { -// console.warn('Please provide a valid audio file URL'); -// return; -// } - -// const { uri } = await FileSystem.downloadAsync( -// audioURL, -// FileSystem.cacheDirectory + 'audio_file' -// ); - -// const audioContext = new AudioContext({ sampleRate: 16000 }); - -// try { -// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); -// const audioBuffer = decodedAudioData.getChannelData(0); -// // model.transcribe now returns Word[], which matches our state type -// setTranscription(await model.transcribe(audioBuffer)); -// } catch (error) { -// console.error('Error decoding audio data', error); -// console.warn('Note: Supported file formats: mp3, wav, flac'); -// return; -// } -// }; - -// const handleStartTranscribeFromMicrophone = async () => { -// setLiveTranscribing(true); -// setTranscription([]); // Reset to empty array -// recorder.onAudioReady(({ buffer }) => { -// model.streamInsert(buffer.getChannelData(0)); -// }); -// recorder.start(); - -// try { -// await model.stream(); -// } catch (error) { -// console.error('Error during live transcription:', error); -// } -// }; - -// const handleStopTranscribeFromMicrophone = () => { -// recorder.stop(); -// model.streamStop(); -// console.log('Live transcription stopped'); -// setLiveTranscribing(false); -// }; - -// const getModelStatus = () => { -// if (model.error) return `${model.error}`; -// if (model.isGenerating) return 'Transcribing...'; -// if (model.isReady) return 'Ready to transcribe'; -// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; -// }; - -// const readyToTranscribe = !model.isGenerating && model.isReady; -// const recordingButtonDisabled = isSimulator || !readyToTranscribe; - -// // CHANGE 3: Prepare the text for rendering -// const displayedText = -// transcription.length > 0 -// ? getText(transcription) -// : getText(model.committedTranscription) + -// getText(model.nonCommittedTranscription); - -// return ( -// -// -// -// -// -// -// -// -// React Native ExecuTorch -// Speech to Text -// - -// -// Status: {getModelStatus()} -// - -// -// Transcription -// -// scrollViewRef.current?.scrollToEnd({ animated: true }) -// } -// > -// {displayedText} -// -// - -// -// -// -// -// Start -// -// - -// {liveTranscribing ? ( -// -// -// Stop Live Transcription -// -// ) : ( -// -// -// -// {isSimulator -// ? 
'Recording is not available on Simulator' -// : 'Start Live Transcription'} -// -// -// )} -// -// -// -// -// ); -// }; - -// const styles = StyleSheet.create({ -// container: { -// flex: 1, -// alignItems: 'center', -// backgroundColor: 'white', -// paddingHorizontal: 16, -// }, -// keyboardAvoidingView: { -// flex: 1, -// width: '100%', -// }, -// header: { -// alignItems: 'center', -// position: 'relative', -// width: '100%', -// }, -// backButton: { -// position: 'absolute', -// left: 0, -// top: 10, -// padding: 10, -// zIndex: 1, -// }, -// headerText: { -// fontSize: 22, -// fontWeight: 'bold', -// color: '#0f186e', -// }, -// statusContainer: { -// marginTop: 12, -// alignItems: 'center', -// }, -// transcriptionContainer: { -// flex: 1, -// width: '100%', -// marginVertical: 12, -// }, -// transcriptionLabel: { -// marginLeft: 12, -// marginBottom: 4, -// color: '#0f186e', -// }, -// transcriptionScrollContainer: { -// borderRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// padding: 12, -// }, -// inputContainer: { -// marginBottom: 12, -// }, -// urlTranscriptionContainer: { -// width: '100%', -// flexDirection: 'row', -// }, -// urlTranscriptionInput: { -// flex: 1, -// padding: 12, -// borderTopLeftRadius: 12, -// borderBottomLeftRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// borderRightWidth: 0, -// }, -// urlTranscriptionButton: { -// backgroundColor: '#0f186e', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderTopRightRadius: 12, -// borderBottomRightRadius: 12, -// }, -// buttonText: { -// color: 'white', -// fontWeight: '600', -// letterSpacing: -0.5, -// fontSize: 16, -// }, -// liveTranscriptionButton: { -// flexDirection: 'row', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderRadius: 12, -// marginTop: 12, -// gap: 8, -// }, -// backgroundRed: { -// backgroundColor: 'red', -// }, -// backgroundBlue: { -// backgroundColor: '#0f186e', -// }, -// disabled: { -// opacity: 0.5, -// }, -// }); - import React, { useEffect, useRef, useState } from 'react'; import { Text, @@ -622,7 +8,7 @@ import { TextInput, KeyboardAvoidingView, Platform, - Switch, // Import Switch + Switch, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { @@ -647,10 +33,8 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - // CHANGE 1: State can now be string OR Word[] const [transcription, setTranscription] = useState(''); - // CHANGE 2: Add toggle for timestamps const [enableTimestamps, setEnableTimestamps] = useState(false); const [audioURL, setAudioURL] = useState(''); @@ -674,20 +58,16 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { AudioManager.requestRecordingPermissions(); }, []); - // CHANGE 3: Smart helper that handles both formats const getText = (data: string | Word[] | undefined) => { - console.log('UI Received:', JSON.stringify(data)); if (!data) return ''; if (typeof data === 'string') return data; - // It's Word[], format with timestamps return data .map((w) => `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`) .join(''); }; const handleTranscribeFromURL = async () => { - console.log('[1] UI: Button Pressed. 
Calling model.stream()...'); if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); return; @@ -708,12 +88,12 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: true, + enableTimestamps: true }); setTranscription(result); } else { const result = await model.transcribe(audioBuffer, { - enableTimestamps: false, + enableTimestamps: false }); setTranscription(result); } @@ -726,7 +106,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const handleStartTranscribeFromMicrophone = async () => { setLiveTranscribing(true); - // Reset based on mode setTranscription(enableTimestamps ? [] : ''); recorder.onAudioReady(({ buffer }) => { @@ -735,12 +114,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { recorder.start(); try { - // CHANGE 5: Pass the toggle flag to stream - if (enableTimestamps) { - await model.stream({ enableTimestamps: true }); - } else { - await model.stream({ enableTimestamps: false }); - } + await model.stream({ enableTimestamps: enableTimestamps }); } catch (error) { console.error('Error during live transcription:', error); } @@ -763,11 +137,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; - // CHANGE 6: Logic to choose what text to display - // We use getText() on everything so it converts Arrays to Strings before concatenation - const hasResult = Array.isArray(transcription) - ? transcription.length > 0 - : transcription.length > 0; + const hasResult = transcription.length > 0; const displayedText = hasResult ? getText(transcription) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index df7f635d9..95da364ff 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -69,15 +69,11 @@ template <> inline Word getValue(const jsi::Value &val, jsi::Runtime &runtime) { jsi::Object obj = val.asObject(runtime); - // 1. Extract the string "word" using the existing string helper std::string content = getValue(obj.getProperty(runtime, "word"), runtime); - // 2. Extract start/end times - // We use .asNumber() directly as these are primitives double start = obj.getProperty(runtime, "start").asNumber(); double end = obj.getProperty(runtime, "end").asNumber(); - // 3. 
Construct and return the C++ Word struct return Word{ .content = std::move(content), .start = static_cast(start), diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index 68e63d612..aa9980bd8 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -43,30 +43,6 @@ SpeechToText::decode(std::span tokens, return std::make_shared(decoderOutput); } -// std::vector SpeechToText::transcribe(std::span waveform, -// std::string languageOption) const -// { -// std::vector segments = -// this->asr->transcribe(waveform, DecodingOptions(languageOption)); -// std::string transcription; - -// size_t transcriptionLength = 0; -// for (auto &segment : segments) { -// for (auto &word : segment.words) { -// transcriptionLength += word.content.size(); -// } -// } -// transcription.reserve(transcriptionLength); - -// for (auto &segment : segments) { -// for (auto &word : segment.words) { -// transcription += word.content; -// } -// } - -// return {transcription.begin(), transcription.end()}; -// } - std::vector SpeechToText::transcribe(std::span waveform, std::string languageOption) const { std::vector segments = @@ -134,51 +110,6 @@ size_t SpeechToText::getMemoryLowerBound() const noexcept { this->decoder->getMemoryLowerBound(); } -// void SpeechToText::stream(std::shared_ptr callback, -// std::string languageOption) { -// if (this->isStreaming) { -// throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress, -// "Streaming is already in progress!"); -// } - -// auto nativeCallback = -// [this, callback](const std::vector &committedVec, -// const std::vector &nonCommittedVec, bool isDone) -// { -// this->callInvoker->invokeAsync([callback, committedVec, -// nonCommittedVec, -// isDone](jsi::Runtime &rt) { -// callback->call( -// rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, -// rt), rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, -// rt), jsi::Value(isDone)); -// }); -// }; - -// this->isStreaming = true; -// while (this->isStreaming) { -// if (!this->readyToProcess || -// this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) -// { -// std::this_thread::sleep_for(std::chrono::milliseconds(100)); -// continue; -// } -// ProcessResult res = -// this->processor->processIter(DecodingOptions(languageOption)); - -// nativeCallback({res.committed.begin(), res.committed.end()}, -// {res.nonCommitted.begin(), res.nonCommitted.end()}, -// false); -// this->readyToProcess = false; -// } - -// std::string committed = this->processor->finish(); - -// nativeCallback({committed.begin(), committed.end()}, {}, true); - -// this->resetStreamState(); -// } - void SpeechToText::stream(std::shared_ptr callback, std::string languageOption, bool enableTimestamps) { if (this->isStreaming) { @@ -220,7 +151,6 @@ void SpeechToText::stream(std::shared_ptr callback, this->readyToProcess = false; } - // finish() now returns std::vector std::vector committed = this->processor->finish(); if (enableTimestamps) { diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 7f42d33cb..40801c7b3 100644 --- 
a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -1,298 +1,3 @@ -// import { useEffect, useCallback, useState } from 'react'; -// import { SpeechToTextModule, Word } from '../../modules/natural_language_processing/SpeechToTextModule'; -// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; -// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; -// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; - -// export const useSpeechToText = ({ -// model, -// preventLoad = false, -// }: { -// model: SpeechToTextModelConfig; -// preventLoad?: boolean; -// }) => { -// const [error, setError] = useState(null); -// const [isReady, setIsReady] = useState(false); -// const [isGenerating, setIsGenerating] = useState(false); -// const [downloadProgress, setDownloadProgress] = useState(0); - -// const [modelInstance] = useState(() => new SpeechToTextModule()); -// const [committedTranscription, setCommittedTranscription] = useState(Word); -// const [nonCommittedTranscription, setNonCommittedTranscription] = -// useState(Word); - -// useEffect(() => { -// if (preventLoad) return; -// (async () => { -// setDownloadProgress(0); -// setError(null); -// try { -// setIsReady(false); -// await modelInstance.load( -// { -// isMultilingual: model.isMultilingual, -// encoderSource: model.encoderSource, -// decoderSource: model.decoderSource, -// tokenizerSource: model.tokenizerSource, -// }, -// setDownloadProgress -// ); -// setIsReady(true); -// } catch (err) { -// setError(parseUnknownError(err)); -// } -// })(); -// }, [ -// modelInstance, -// model.isMultilingual, -// model.encoderSource, -// model.decoderSource, -// model.tokenizerSource, -// preventLoad, -// ]); - -// const stateWrapper = useCallback( -// Promise>(fn: T) => -// async (...args: Parameters): Promise>> => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// if (isGenerating) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModelGenerating, -// 'The model is currently generating. Please wait until previous model run is complete.' -// ); -// setIsGenerating(true); -// try { -// return await fn.apply(modelInstance, args); -// } finally { -// setIsGenerating(false); -// } -// }, -// [isReady, isGenerating, modelInstance] -// ); - -// const stream = useCallback( -// async (options?: DecodingOptions) => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// if (isGenerating) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModelGenerating, -// 'The model is currently generating. Please wait until previous model run is complete.' 
-// ); -// setIsGenerating(true); -// setCommittedTranscription(''); -// setNonCommittedTranscription(''); -// let transcription = ''; -// try { -// for await (const { committed, nonCommitted } of modelInstance.stream( -// options -// )) { -// setCommittedTranscription((prev) => prev + committed); -// setNonCommittedTranscription(nonCommitted); -// transcription += committed; -// } -// } finally { -// setIsGenerating(false); -// } -// return transcription; -// }, -// [isReady, isGenerating, modelInstance] -// ); - -// const wrapper = useCallback( -// any>(fn: T) => { -// return (...args: Parameters): ReturnType => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// return fn.apply(modelInstance, args); -// }; -// }, -// [isReady, modelInstance] -// ); - -// return { -// error, -// isReady, -// isGenerating, -// downloadProgress, -// committedTranscription, -// nonCommittedTranscription, -// encode: stateWrapper(SpeechToTextModule.prototype.encode), -// decode: stateWrapper(SpeechToTextModule.prototype.decode), -// transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), -// stream, -// streamStop: wrapper(SpeechToTextModule.prototype.streamStop), -// streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), -// }; -// }; - -// import { useEffect, useCallback, useState } from 'react'; -// // Make sure Word is exported from your module file -// import { -// SpeechToTextModule, -// Word, -// } from '../../modules/natural_language_processing/SpeechToTextModule'; -// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt'; -// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; -// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; - -// export const useSpeechToText = ({ -// model, -// preventLoad = false, -// }: { -// model: SpeechToTextModelConfig; -// preventLoad?: boolean; -// }) => { -// const [error, setError] = useState(null); -// const [isReady, setIsReady] = useState(false); -// const [isGenerating, setIsGenerating] = useState(false); -// const [downloadProgress, setDownloadProgress] = useState(0); - -// const [modelInstance] = useState(() => new SpeechToTextModule()); - -// // FIX 1: Initialize with empty array [], generic type Word[] -// const [committedTranscription, setCommittedTranscription] = useState( -// [] -// ); -// const [nonCommittedTranscription, setNonCommittedTranscription] = useState< -// Word[] -// >([]); - -// useEffect(() => { -// if (preventLoad) return; -// (async () => { -// setDownloadProgress(0); -// setError(null); -// try { -// setIsReady(false); -// await modelInstance.load( -// { -// isMultilingual: model.isMultilingual, -// encoderSource: model.encoderSource, -// decoderSource: model.decoderSource, -// tokenizerSource: model.tokenizerSource, -// }, -// setDownloadProgress -// ); -// setIsReady(true); -// } catch (err) { -// setError(parseUnknownError(err)); -// } -// })(); -// }, [ -// modelInstance, -// model.isMultilingual, -// model.encoderSource, -// model.decoderSource, -// model.tokenizerSource, -// preventLoad, -// ]); - -// const stateWrapper = useCallback( -// Promise>(fn: T) => -// async (...args: Parameters): Promise>> => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' 
-// ); -// if (isGenerating) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModelGenerating, -// 'The model is currently generating. Please wait until previous model run is complete.' -// ); -// setIsGenerating(true); -// try { -// return await fn.apply(modelInstance, args); -// } finally { -// setIsGenerating(false); -// } -// }, -// [isReady, isGenerating, modelInstance] -// ); - -// const stream = useCallback( -// async (options?: DecodingOptions) => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// if (isGenerating) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModelGenerating, -// 'The model is currently generating. Please wait until previous model run is complete.' -// ); -// setIsGenerating(true); - -// // FIX 2: Reset to empty arrays -// setCommittedTranscription([]); -// setNonCommittedTranscription([]); - -// // Accumulator is now an array of Words, not a string -// const fullResult: Word[] = []; - -// try { -// for await (const { committed, nonCommitted } of modelInstance.stream( -// options -// )) { -// // FIX 3: Update state by appending arrays -// if (committed.length > 0) { -// setCommittedTranscription((prev) => [...prev, ...committed]); -// fullResult.push(...committed); -// } - -// // nonCommitted is always a fresh partial chunk -// setNonCommittedTranscription(nonCommitted); -// } -// } finally { -// setIsGenerating(false); -// } -// return fullResult; -// }, -// [isReady, isGenerating, modelInstance] -// ); - -// const wrapper = useCallback( -// any>(fn: T) => { -// return (...args: Parameters): ReturnType => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// return fn.apply(modelInstance, args); -// }; -// }, -// [isReady, modelInstance] -// ); - -// return { -// error, -// isReady, -// isGenerating, -// downloadProgress, -// committedTranscription, -// nonCommittedTranscription, -// encode: stateWrapper(SpeechToTextModule.prototype.encode), -// decode: stateWrapper(SpeechToTextModule.prototype.decode), -// transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), -// stream, -// streamStop: wrapper(SpeechToTextModule.prototype.streamStop), -// streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), -// }; -// }; - import { useEffect, useCallback, useState } from 'react'; import { SpeechToTextModule, diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index 5d5fd3248..1ad2f6c97 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -4,7 +4,6 @@ import { ResourceFetcher } from '../../utils/ResourceFetcher'; import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; -// 1. 
Define the Word interface matching your C++ JSI object structure export interface Word { word: string; start: number; @@ -142,10 +141,8 @@ export class SpeechToTextModule { committed: string | Word[]; nonCommitted: string | Word[]; }> { - console.log('[4] Module: Entered stream method'); this.validateOptions(options); - // Ensure we strictly default to false const enableTimestamps = options.enableTimestamps === true; const queue: { From 4e1ae51f2d638daf7a05454f002ad6bd3c419a96 Mon Sep 17 00:00:00 2001 From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> Date: Wed, 21 Jan 2026 16:36:46 +0100 Subject: [PATCH 06/14] Apply suggestions from code review --- apps/speech/screens/SpeechToTextScreen.tsx | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index f844241f3..ab3cdefb0 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -88,12 +88,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: true - }); - setTranscription(result); - } else { - const result = await model.transcribe(audioBuffer, { - enableTimestamps: false + enableTimestamps: enableTimestamps }); setTranscription(result); } @@ -164,18 +159,17 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { Status: {getModelStatus()} - {/* CHANGE 7: Add UI for the Toggle */} Enable Timestamps { setEnableTimestamps(val); - setTranscription(val ? [] : ''); // Reset transcription on toggle + setTranscription(val ? [] : ''); }} trackColor={{ false: '#767577', true: '#0f186e' }} thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'} - disabled={model.isGenerating} // Disable changing mode while running + disabled={model.isGenerating} /> From b6bfdb72385724e9cff91da6d55a613d1d20c124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 16:46:49 +0100 Subject: [PATCH 07/14] Apply further clearing --- apps/speech/screens/SpeechToTextScreen.tsx | 4 --- .../host_objects/JsiConversions.h | 1 - .../host_objects/ModelHostObject.h | 3 --- .../models/speech_to_text/SpeechToText.h | 1 - .../stream/OnlineASRProcessor.cpp | 25 ++++++------------- .../stream/OnlineASRProcessor.h | 3 --- .../speech_to_text/types/ProcessResult.h | 5 ---- .../useSpeechToText.ts | 14 ----------- 8 files changed, 7 insertions(+), 49 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index ab3cdefb0..4be72abf4 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -84,8 +84,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); - // CHANGE 4: Pass the toggle flag to transcribe - // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { enableTimestamps: enableTimestamps @@ -240,7 +238,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { }; const styles = StyleSheet.create({ - // ... existing styles ... 
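// A minimal sketch of how the transcribe() overloads added in this series are expected to
// resolve, given the Word shape exported above; importing from the package root is an
// assumption (inside the repo the module lives under src/modules/natural_language_processing),
// and the sketch presumes stt.load(...) has already completed.
import { SpeechToTextModule, Word } from 'react-native-executorch';

async function transcribeBothWays(stt: SpeechToTextModule, waveform: Float32Array) {
  // With enableTimestamps: true the word-level overload applies; start/end are the offsets
  // the example screen formats as seconds.
  const words: Word[] = await stt.transcribe(waveform, { enableTimestamps: true });
  // With enableTimestamps: false the original plain-string behaviour is kept
  // (native bytes decoded via TextDecoder).
  const text: string = await stt.transcribe(waveform, { enableTimestamps: false });
  return { words, text };
}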
container: { flex: 1, alignItems: 'center', @@ -272,7 +269,6 @@ const styles = StyleSheet.create({ marginTop: 12, alignItems: 'center', }, - // New style for the toggle toggleContainer: { flexDirection: 'row', alignItems: 'center', diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 95da364ff..e95e930e7 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -320,7 +320,6 @@ inline jsi::Value getJsiValue(const Word &word, jsi::Runtime &runtime) { inline jsi::Value getJsiValue(const std::vector &vec, jsi::Runtime &runtime) { jsi::Array array(runtime, vec.size()); for (size_t i = 0; i < vec.size(); ++i) { - // Convert each Word using the helper above and place in array array.setValueAtIndex(runtime, i, getJsiValue(vec[i], runtime)); } return {runtime, array}; diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index f0ec05b64..1843e8672 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -20,14 +20,11 @@ #include #include #include -#include #include #include #include #include -using rnexecutorch::models::speech_to_text::types::Word; - namespace rnexecutorch { template class ModelHostObject : public JsiHostObject { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h index 8f6799c4e..883436f4a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h @@ -23,7 +23,6 @@ class SpeechToText { [[nodiscard( "Registered non-void function")]] std::shared_ptr decode(std::span tokens, std::span encoderOutput) const; - // [[nodiscard("Registered non-void function")]] std::vector [[nodiscard("Registered non-void function")]] std::vector transcribe(std::span waveform, std::string languageOption) const; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp index f62986b72..3137d274b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp @@ -34,12 +34,14 @@ ProcessResult OnlineASRProcessor::processIter(const DecodingOptions &options) { chunkCompletedSegment(res); } + auto move_to_vector = [](auto& container) { + return std::vector(std::make_move_iterator(container.begin()), + std::make_move_iterator(container.end())); + }; + std::deque nonCommittedWords = this->hypothesisBuffer.complete(); - // return {this->toFlush(flushed), this->toFlush(nonCommittedWords)}; - return {std::vector(std::make_move_iterator(flushed.begin()), - std::make_move_iterator(flushed.end())), - std::vector(std::make_move_iterator(nonCommittedWords.begin()), - 
std::make_move_iterator(nonCommittedWords.end()))}; + + return { move_to_vector(flushed), move_to_vector(nonCommittedWords) }; } void OnlineASRProcessor::chunkCompletedSegment(std::span res) { @@ -86,22 +88,9 @@ std::vector OnlineASRProcessor::finish() { std::vector buffer(std::make_move_iterator(bufferDeq.begin()), std::make_move_iterator(bufferDeq.end())); - // std::string committedText = this->toFlush(buffer); this->bufferTimeOffset += static_cast(audioBuffer.size()) / OnlineASRProcessor::kSamplingRate; return buffer; } -// std::string OnlineASRProcessor::toFlush(const std::deque &words) const -// { -// std::string text; -// text.reserve(std::accumulate( -// words.cbegin(), words.cend(), 0, -// [](size_t sum, const Word &w) { return sum + w.content.size(); })); -// for (const auto &word : words) { -// text.append(word.content); -// } -// return text; -// } - } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h index 720e6bf76..3abaad3b6 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h @@ -12,7 +12,6 @@ class OnlineASRProcessor { void insertAudioChunk(std::span audio); types::ProcessResult processIter(const types::DecodingOptions &options); - // std::string finish(); std::vector finish(); std::vector audioBuffer; @@ -27,8 +26,6 @@ class OnlineASRProcessor { void chunkCompletedSegment(std::span res); void chunkAt(float time); - - // std::string toFlush(const std::deque &words) const; }; } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h index 685ba2b76..681495e2a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h @@ -4,11 +4,6 @@ namespace rnexecutorch::models::speech_to_text::types { -// struct ProcessResult { -// std::string committed; -// std::string nonCommitted; -// }; - struct ProcessResult { std::vector committed; std::vector nonCommitted; diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 40801c7b3..2ff785490 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -21,7 +21,6 @@ export const useSpeechToText = ({ const [modelInstance] = useState(() => new SpeechToTextModule()); - // FIX 1: Allow state to be either string or Word[] const [committedTranscription, setCommittedTranscription] = useState< string | Word[] >(''); @@ -77,12 +76,6 @@ export const useSpeechToText = ({ const stream = useCallback( async (options?: DecodingOptions & { enableTimestamps?: boolean }) => { - console.log( - '[2] Hook: Stream called. 
Ready:', - isReady, - 'Generating:', - isGenerating - ); if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, @@ -96,7 +89,6 @@ export const useSpeechToText = ({ setIsGenerating(true); - // FIX 2: Reset based on the mode requested const enableTimestamps = options?.enableTimestamps ?? false; setCommittedTranscription(enableTimestamps ? [] : ''); setNonCommittedTranscription(enableTimestamps ? [] : ''); @@ -104,18 +96,13 @@ export const useSpeechToText = ({ let fullResult: string | Word[] = enableTimestamps ? [] : ''; try { - console.log('[3] Hook: Calling modelInstance.stream()'); - // @ts-ignore - Typescript struggles with the dual generator return type, but logic is safe for await (const { committed, nonCommitted } of modelInstance.stream( options )) { console.log(committed, nonCommitted); - // FIX 3: Dynamic Merging Logic if (typeof committed === 'string') { - // --- STRING MODE --- if (committed.length > 0) { setCommittedTranscription((prev) => { - // Safety check: if prev was somehow an array, reset it or cast to string const prevStr = typeof prev === 'string' ? prev : ''; return prevStr + committed; }); @@ -123,7 +110,6 @@ export const useSpeechToText = ({ } setNonCommittedTranscription(nonCommitted as string); } else { - // --- WORD[] MODE --- const committedWords = committed as Word[]; const nonCommittedWords = nonCommitted as Word[]; From dfea40e05da8c313d2e6dec379897e29dbf12bac Mon Sep 17 00:00:00 2001 From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> Date: Wed, 21 Jan 2026 17:15:36 +0100 Subject: [PATCH 08/14] Apply suggestion from @msluszniak --- .../hooks/natural_language_processing/useSpeechToText.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 2ff785490..2afca892c 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -49,7 +49,14 @@ export const useSpeechToText = ({ setError(parseUnknownError(err)); } })(); - }, [modelInstance, model, preventLoad]); + }, [ + modelInstance, + model.isMultilingual, + model.encoderSource, + model.decoderSource, + model.tokenizerSource, + preventLoad, + ]); const stateWrapper = useCallback( Promise>(fn: T) => From 41839d0c634ebee4a5affcdeaad59f9dfc1681b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 17:31:07 +0100 Subject: [PATCH 09/14] Apply autofix lint changes --- apps/speech/screens/SpeechToTextScreen.tsx | 2 +- .../modules/natural_language_processing/SpeechToTextModule.ts | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 4be72abf4..87ad8bd50 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -86,7 +86,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: enableTimestamps + enableTimestamps: enableTimestamps, }); setTranscription(result); } diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts 
b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index 1ad2f6c97..61162078c 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -135,9 +135,7 @@ export class SpeechToTextModule { options?: DecodingOptions & { enableTimestamps?: false | undefined } ): AsyncGenerator<{ committed: string; nonCommitted: string }>; - public async *stream( - options: DecodingOptions = {} - ): AsyncGenerator<{ + public async *stream(options: DecodingOptions = {}): AsyncGenerator<{ committed: string | Word[]; nonCommitted: string | Word[]; }> { From 2f916f8cbca0ae41e8778ebdc600efa9feb4da10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 17:57:16 +0100 Subject: [PATCH 10/14] Fix linter issues --- apps/llm/app/voice_chat/index.tsx | 14 ++++++++++-- .../useSpeechToText.ts | 22 +++++++++++++------ .../SpeechToTextModule.ts | 6 ++++- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx index 79a713c93..0bf4c9b30 100644 --- a/apps/llm/app/voice_chat/index.tsx +++ b/apps/llm/app/voice_chat/index.tsx @@ -76,7 +76,11 @@ function VoiceChatScreen() { }); recorder.start(); const transcription = await speechToText.stream(); - await llm.sendMessage(transcription); + await llm.sendMessage( + typeof transcription === 'string' + ? transcription + : transcription.map((w) => w.word).join(' ') + ); } }; @@ -105,7 +109,13 @@ function VoiceChatScreen() { ...llm.messageHistory, { role: 'user', - content: speechToText.committedTranscription, + content: + typeof speechToText.committedTranscription === + 'string' + ? speechToText.committedTranscription + : speechToText.committedTranscription + .map((w) => w.word) + .join(' '), }, ] : llm.messageHistory diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 2afca892c..da6549bec 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -103,11 +103,17 @@ export const useSpeechToText = ({ let fullResult: string | Word[] = enableTimestamps ? [] : ''; try { - for await (const { committed, nonCommitted } of modelInstance.stream( - options - )) { - console.log(committed, nonCommitted); + const streamGen = modelInstance.stream( + options as any + ) as AsyncGenerator<{ + committed: string | Word[]; + nonCommitted: string | Word[]; + }>; + + for await (const { committed, nonCommitted } of streamGen) { if (typeof committed === 'string') { + const nc = nonCommitted as unknown as string; + if (committed.length > 0) { setCommittedTranscription((prev) => { const prevStr = typeof prev === 'string' ? prev : ''; @@ -115,12 +121,12 @@ export const useSpeechToText = ({ }); (fullResult as string) += committed; } - setNonCommittedTranscription(nonCommitted as string); + setNonCommittedTranscription(nc); } else { const committedWords = committed as Word[]; const nonCommittedWords = nonCommitted as Word[]; - if (committedWords.length > 0) { + if (committedWords && committedWords.length > 0) { setCommittedTranscription((prev) => { const prevArr = Array.isArray(prev) ? 
prev : []; return [...prevArr, ...committedWords]; @@ -161,7 +167,9 @@ export const useSpeechToText = ({ nonCommittedTranscription, encode: stateWrapper(SpeechToTextModule.prototype.encode), decode: stateWrapper(SpeechToTextModule.prototype.decode), - transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), + transcribe: stateWrapper( + SpeechToTextModule.prototype.transcribe + ) as SpeechToTextModule['transcribe'], stream, streamStop: wrapper(SpeechToTextModule.prototype.streamStop), streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index 61162078c..98520a2e7 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -95,11 +95,13 @@ export class SpeechToTextModule { options?: DecodingOptions & { enableTimestamps: true } ): Promise; + // eslint-disable-next-line no-dupe-class-members public async transcribe( waveform: Float32Array | number[], options?: DecodingOptions & { enableTimestamps?: false | undefined } ): Promise; + // eslint-disable-next-line no-dupe-class-members public async transcribe( waveform: Float32Array | number[], options: DecodingOptions = {} @@ -131,10 +133,12 @@ export class SpeechToTextModule { options: DecodingOptions & { enableTimestamps: true } ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }>; + // eslint-disable-next-line no-dupe-class-members public stream( options?: DecodingOptions & { enableTimestamps?: false | undefined } ): AsyncGenerator<{ committed: string; nonCommitted: string }>; + // eslint-disable-next-line no-dupe-class-members public async *stream(options: DecodingOptions = {}): AsyncGenerator<{ committed: string | Word[]; nonCommitted: string | Word[]; @@ -173,7 +177,7 @@ export class SpeechToTextModule { ), }); } catch (err) { - console.error('[Stream Decode Error]', err); + Logger.error('[Stream Decode Error]', err); } } else { queue.push({ committed, nonCommitted }); From 55f03be5ad90c4b867d6592ecd909a98865d7ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 18:40:42 +0100 Subject: [PATCH 11/14] Revert changing error messages --- .../hooks/natural_language_processing/useSpeechToText.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index da6549bec..c36e802db 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -64,12 +64,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded.' + 'The model is currently not loaded. Please load the model before calling this function.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating.' + 'The model is currently generating. Please wait until previous model run is complete.' 
); setIsGenerating(true); try { @@ -86,12 +86,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'Model not loaded' + 'The model is currently not loaded. Please load the model before calling this function.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'Model is generating' + 'The model is currently generating. Please wait until previous model run is complete.' ); setIsGenerating(true); From 2fafd87ab41691edf0e902a9d35ebb2ae85b0ef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 18:42:25 +0100 Subject: [PATCH 12/14] Revert one more message --- .../src/hooks/natural_language_processing/useSpeechToText.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index c36e802db..107053a47 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -150,7 +150,7 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'Model not loaded' + 'The model is currently not loaded. Please load the model before calling this function.' ); return fn.apply(modelInstance, args); }; From d394088b5b350c824d8e8fb33360740dd867ac87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 19:42:44 +0100 Subject: [PATCH 13/14] Update docs --- .../useSpeechToText.md | 105 +++++++++++++----- .../SpeechToTextModule.md | 56 +++++++--- 2 files changed, 120 insertions(+), 41 deletions(-) diff --git a/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md index d94c96a66..b3171c77f 100644 --- a/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md +++ b/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md @@ -75,25 +75,31 @@ For more information on loading resources, take a look at [loading models](../.. ### Returns -| Field | Type | Description | -| --------------------------- | ---------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. | -| `stream` | `(options?: DecodingOptions \| undefined) => Promise` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. 
Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | -| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | -| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | -| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | -| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. | -| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | -| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | -| `error` | `string \| null` | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Tracks the progress of the model download process. | +| Field | Type | Description | +| --------------------------- | ---------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `transcribe` | `(waveform: Float32Array | number[], options?: DecodingOptions & { enableTimestamps?: boolean }) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. For multilingual models, specify the language in `options`, e.g. `{ language: 'es' }` for multilingual models. If `enableTimestamps` is true, returns transcription with timestamps (`Word[]>`). If `enableTimestamps` is false (default), returns transcription as a string. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. | +| `stream` | `(options?: DecodingOptions & { enableTimestamps?: boolean }) => Promise` | Starts a streaming transcription process. 
Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. As in `transcribe`, you can decide either you want transcription with timestamps or not by setting `enableTimestamps`. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | +| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | +| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | +| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | +| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. | +| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | +| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | +| `error` | `string \| null` | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Tracks the progress of the model download process. |
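For streaming, a minimal sketch of how these pieces fit together is shown below. It is not taken from the library's examples: the `recorder` object is a hypothetical audio source that delivers 16 kHz `Float32Array` chunks to a callback (any source with that shape would do), while the `useSpeechToText` calls themselves (`stream`, `streamInsert`, `streamStop`, `committedTranscription`) are the ones documented in the table above.

```tsx
import React from 'react';
import { Button, Text, View } from 'react-native';
import { useSpeechToText, WHISPER_TINY_EN, Word } from 'react-native-executorch';

// Hypothetical audio source: anything that hands you 16 kHz Float32Array chunks.
declare const recorder: {
  onAudioReady: (cb: (chunk: Float32Array) => void) => void;
  start: () => void;
  stop: () => void;
};

function LiveTranscription() {
  const model = useSpeechToText({ model: WHISPER_TINY_EN });

  const startStreaming = async () => {
    // Feed chunks into the ongoing stream as they arrive.
    recorder.onAudioReady((chunk) => model.streamInsert(chunk));
    recorder.start();
    // Resolves with the full transcription after streamStop() is called;
    // with enableTimestamps it is a Word[] instead of a string.
    const words = await model.stream({ enableTimestamps: true });
    console.log(words);
  };

  const stopStreaming = () => {
    recorder.stop();
    model.streamStop();
  };

  // While streaming with timestamps, committedTranscription holds Word[];
  // fall back to the raw string when timestamps are disabled.
  const committedText = Array.isArray(model.committedTranscription)
    ? model.committedTranscription.map((w: Word) => w.word).join(' ')
    : model.committedTranscription;

  return (
    <View>
      <Button title="Start" onPress={startStreaming} disabled={!model.isReady} />
      <Button title="Stop" onPress={stopStreaming} disabled={!model.isGenerating} />
      <Text>{committedText}</Text>
    </View>
  );
}
```

The same call order works without timestamps: omit `enableTimestamps` and the transcription fields remain plain strings.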
Type definitions ```typescript +interface Word { + word: string; + start: number; + end: number; +} + // Languages supported by whisper (Multilingual) type SpeechToTextLanguage = | 'af' @@ -174,6 +180,7 @@ type SpeechToTextLanguage = interface DecodingOptions { language?: SpeechToTextLanguage; + enableTimestamps?: boolean; } interface SpeechToTextModelConfig { @@ -204,12 +211,25 @@ const model = useSpeechToText({ const transcription = await model.transcribe(spanishAudio, { language: 'es' }); ``` +### Timestamps + +You can obtain word-level timestamps by setting `enableTimestamps: true` in the options. This changes the return type from a string to an array of `Word` objects. + +```typescript +const words = await model.transcribe(audioBuffer, { enableTimestamps: true }); +// words: [{ word: "Hello", start: 0.0, end: 0.4 }, ...] +``` + ## Example ```tsx import React, { useState } from 'react'; -import { Button, Text } from 'react-native'; -import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +import { Button, Text, View } from 'react-native'; +import { + useSpeechToText, + WHISPER_TINY_EN, + Word, +} from 'react-native-executorch'; import { AudioContext } from 'react-native-audio-api'; import * as FileSystem from 'expo-file-system'; @@ -218,7 +238,7 @@ function App() { model: WHISPER_TINY_EN, }); - const [transcription, setTranscription] = useState(''); + const [transcription, setTranscription] = useState(''); const loadAudio = async () => { const { uri } = await FileSystem.downloadAsync( @@ -235,14 +255,38 @@ function App() { const handleTranscribe = async () => { const audio = await loadAudio(); - await model.transcribe(audio); + // Default text transcription + const result = await model.transcribe(audio); + setTranscription(result); + }; + + const handleTranscribeWithTimestamps = async () => { + const audio = await loadAudio(); + // Transcription with timestamps + const result = await model.transcribe(audio, { enableTimestamps: true }); + setTranscription(result); + }; + + const renderContent = () => { + if (typeof transcription === 'string') { + return {transcription}; + } + return transcription.map((w, i) => ( + + {w.word} ({w.start.toFixed(2)}s) + + )); }; return ( - <> - {transcription} -