Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 32 additions & 21 deletions app/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
import tempfile
from dataclasses import asdict
from typing import BinaryIO, TextIO

Expand Down Expand Up @@ -94,34 +95,44 @@ def write_result(self, result: dict, file: TextIO):
json.dump(result, file)


def load_audio(file: BinaryIO, encode: bool = True, sr: int = CONFIG.SAMPLE_RATE) -> np.ndarray:
    """
    Open an audio file object and read as mono waveform, resampling as necessary.

    Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py
    to accept a file object. When ``encode`` is true the input is first written
    to a temporary file so that ffmpeg has a seekable source.

    Parameters
    ----------
    file : BinaryIO
        The audio file-like object. It is read to the end with ``file.read()``.
    encode : bool
        If true, decode the input with ffmpeg into mono raw s16le PCM at ``sr``.
        If false, the bytes are assumed to already be raw s16le PCM and are
        converted as-is (no down-mixing or resampling is performed).
    sr : int
        The sample rate to resample the audio to when ``encode`` is true.

    Returns
    -------
    np.ndarray
        A float32 NumPy array containing the waveform in range [-1.0, 1.0).

    Raises
    ------
    RuntimeError
        If ffmpeg fails to decode the input; the message carries ffmpeg's stderr.
    ValueError
        Propagated from ``np.frombuffer`` when the PCM byte length is not a
        multiple of 2 (one int16 sample).
    """
    data = file.read()

    if not encode:
        # Raw PCM mode (assume s16le bytes already).
        return np.frombuffer(data, np.int16).astype(np.float32) / 32768.0

    # delete=False plus an explicit unlink: a NamedTemporaryFile that is still
    # open cannot be reopened by name on every platform (notably Windows), so
    # the file is closed before ffmpeg reads it and removed afterwards.
    tmp = tempfile.NamedTemporaryFile(suffix=".audio", delete=False)
    try:
        with tmp:
            tmp.write(data)

        # This launches a subprocess to decode audio while down-mixing and
        # resampling as necessary. Requires the ffmpeg CLI and the
        # `ffmpeg-python` package to be installed.
        out, _ = (
            ffmpeg
            .input(tmp.name)
            .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
            .global_args("-nostdin", "-v", "error")
            .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        # stderr is captured above, but guard against None so the original
        # failure is never masked by an AttributeError.
        stderr = e.stderr.decode(errors='ignore') if e.stderr else ""
        raise RuntimeError(f"Failed to load audio: {stderr}") from e
    finally:
        # Best-effort cleanup; a failure to remove the temp file must not hide
        # the decode result or the decode error.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass

    return np.frombuffer(out, np.int16).astype(np.float32) / 32768.0