ahmetoner · ntviet18 · Oct 27, 2025
diff --git a/app/utils.py b/app/utils.py
@@ -1,5 +1,6 @@
 import json
 import os
+import tempfile
 from dataclasses import asdict
 from typing import BinaryIO, TextIO
 
@@ -94,34 +95,44 @@ def write_result(self, result: dict, file: TextIO):
         json.dump(result, file)
 
 
-def load_audio(file: BinaryIO, encode=True, sr: int = CONFIG.SAMPLE_RATE):
+def load_audio(file, encode: bool = True, sr: int = CONFIG.SAMPLE_RATE):
     """
     Open an audio file object and read as mono waveform, resampling as necessary.
-    Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py to accept a file object
+    When encode is true, writes the input to a temp file so ffmpeg has a seekable source.
+
     Parameters
     ----------
-    file: BinaryIO
-        The audio file like object
-    encode: Boolean
-        If true, encode audio stream to WAV before sending to whisper
-    sr: int
-        The sample rate to resample the audio if necessary
+    file : BinaryIO
+        The audio file-like object.
+    encode : bool
+        If true, decode the input with ffmpeg to mono raw s16le PCM; if false, treat the input as raw s16le bytes.
+    sr : int
+        The sample rate to resample the audio if necessary.
+
     Returns
     -------
-    A NumPy array containing the audio waveform, in float32 dtype.
+    np.ndarray
+        A float32 NumPy array containing the waveform in range [-1.0, 1.0).
     """
-    if encode:
-        try:
-            # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
-            # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+    data = file.read()
+
+    if not encode:
+        # Raw PCM mode (assume s16le bytes already).
+        return np.frombuffer(data, np.int16).astype(np.float32) / 32768.0
+
+    try:
+        with tempfile.NamedTemporaryFile(suffix=".audio", delete=True) as tmp:
+            tmp.write(data)
+            tmp.flush()
+
             out, _ = (
-                ffmpeg.input("pipe:", threads=0)
-                .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
-                .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=file.read())
+                ffmpeg
+                .input(tmp.name)
+                .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+                .global_args("-nostdin", "-v", "error")
+                .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True)
             )
-        except ffmpeg.Error as e:
-            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
-    else:
-        out = file.read()
+    except ffmpeg.Error as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode(errors='ignore')}") from e
 
-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+    return np.frombuffer(out, np.int16).astype(np.float32) / 32768.0