From f55ca2a3c17e6c0ee2af53d0ac6d12304a820e90 Mon Sep 17 00:00:00 2001 From: hyperuser178 Date: Sun, 14 Dec 2025 01:14:46 +0000 Subject: [PATCH 1/5] Add `ffmpeg`-based function `wavread` to read audio files This function reads audio files using ffmpeg, allowing for asynchronous reading and various output configurations. --- rvc/lib/uvr5_pack/ffio.py | 79 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 rvc/lib/uvr5_pack/ffio.py diff --git a/rvc/lib/uvr5_pack/ffio.py b/rvc/lib/uvr5_pack/ffio.py new file mode 100644 index 0000000..bdc74c8 --- /dev/null +++ b/rvc/lib/uvr5_pack/ffio.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +import numpy as np, ffmpeg +## +def wavread( + filepath:str, fs:int=0, ch:int=0, + dtype=None, read_async=False, + **kwargs) -> tuple[np.ndarray, int]: + """ + Reads an audio file using `ffmpeg` & + returns the audio data as an `np.ndarray` & + the sample rate (`fs`). + Args: + filepath (str): Path to the audio file. + fs (int, optional): Desired sample rate. If 0, + the original sample rate is used. Defaults to 0. + ch (int, optional): Desired number of channels. If 0, + the original number of channels is used. Defaults to 0. + dtype (data-type, optional): Desired data type + for the output array. If None, the data is returned + as 32-bit float. Defaults to None. + read_async (bool, optional): If True, reads the audio data + asynchronously. Defaults to False. + **kwargs: Additional arguments. + Returns: + tuple[np.ndarray, int]: A tuple containing the audio data + as a NumPy array and the sample rate. 
+ """ + ## Performing FFProbe the Audio File to get Stream Information + d_probe = ffmpeg.probe(filepath) + st_audio = next( + s for s in d_probe["streams"] \ + if s["codec_type"] == "audio") + ## Using original sample rate and channels if not specified + fs = fs or int(st_audio["sample_rate"]) + ch = ch or int(st_audio["channels"]) + ## Determining the float32 format based on system endianness + fp32 = "f4" + ffmpeg_format = "f32le" if np.little_endian else "f32be" + ffmpeg_acodec = f"pcm_{ffmpeg_format}" + ## Reading the Audio Asynchronously + if read_async: + async_pipe = ( + ffmpeg + .input(filepath) + .output("pipe:", + format = ffmpeg_format, + acodec = ffmpeg_acodec, + ac = ch, + ar = fs, + loglevel = "error") + .run_async(pipe_stdout=True)) + pcm_raw = async_pipe.stdout.read() + async_pipe.wait() + x_raw = np.frombuffer(pcm_raw, dtype=fp32) + ## Reading the Audio Synchronously + else: + pcm_out, _ = ( + ffmpeg + .input(filepath) + .output("pipe:", + format = ffmpeg_format, + acodec = ffmpeg_acodec, + ac = ch, + ar = fs, + loglevel = "error") + .run( + capture_stdout = True, + capture_stderr = True, + )) + x_raw = np.frombuffer(pcm_out, dtype=fp32) + ## Converting to the target data type if specified + if dtype: + dt = np.dtype(dtype) + if dt.kind == "i": + x_raw = np.clip(x_raw, -1., +1.) + x_raw = (x_raw * (np.iinfo(dt).max - 1)).astype(dt) + ## Returning with shapped as (channels, samples) + return x_raw.reshape(-1, ch).T, fs +## From 772ae871b26fcffd3524056b47949f015276a3c4 Mon Sep 17 00:00:00 2001 From: hyperuser178 Date: Sun, 14 Dec 2025 02:04:05 +0000 Subject: [PATCH 2/5] Added `wavread_rosa` based on `librosa` params & `librosa` based resampling Updated `wavread` function to include resampling using `librosa`, & added `wavread_rosa` for `librosa`-like params. Due to that the param `res_type` is defined in `librosa`, resampling is performed using `librosa.resample` as well. 
--- rvc/lib/uvr5_pack/ffio.py | 84 +++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/rvc/lib/uvr5_pack/ffio.py b/rvc/lib/uvr5_pack/ffio.py index bdc74c8..afdd208 100644 --- a/rvc/lib/uvr5_pack/ffio.py +++ b/rvc/lib/uvr5_pack/ffio.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- -import numpy as np, ffmpeg +import numpy as np, librosa as rosa, ffmpeg ## def wavread( filepath:str, fs:int=0, ch:int=0, - dtype=None, read_async=False, + dtype=None, read_async=False, res_type:str="soxr_hq", **kwargs) -> tuple[np.ndarray, int]: """ Reads an audio file using `ffmpeg` & @@ -20,6 +20,8 @@ def wavread( as 32-bit float. Defaults to None. read_async (bool, optional): If True, reads the audio data asynchronously. Defaults to False. + res_type (str, optional): Resampling type + defined in `librosa`, defaulting to "soxr_hq". **kwargs: Additional arguments. Returns: tuple[np.ndarray, int]: A tuple containing the audio data @@ -30,50 +32,82 @@ def wavread( st_audio = next( s for s in d_probe["streams"] \ if s["codec_type"] == "audio") - ## Using original sample rate and channels if not specified - fs = fs or int(st_audio["sample_rate"]) - ch = ch or int(st_audio["channels"]) + ch_origin = int(st_audio["channels"]) + fs_origin = int(st_audio["sample_rate"]) + ch = ch or ch_origin + fs = fs or fs_origin ## Determining the float32 format based on system endianness - fp32 = "f4" - ffmpeg_format = "f32le" if np.little_endian else "f32be" + fp32, ffmpeg_format = ("f4", "f32be") ffmpeg_acodec = f"pcm_{ffmpeg_format}" + ## Setting Keyword-based (non-positional) Args of `ffmpeg.output` + kwgs_output = { + "format": ffmpeg_format, + "acodec": ffmpeg_acodec, + ## Using original sample rate and channels if not specified + "ac": ch, + #"ar": fs, # Resampling later using `librosa` + "loglevel": "error"} ## Reading the Audio Asynchronously if read_async: async_pipe = ( ffmpeg .input(filepath) - .output("pipe:", - format = ffmpeg_format, - acodec = 
ffmpeg_acodec, - ac = ch, - ar = fs, - loglevel = "error") + .output("pipe:", **kwgs_output) .run_async(pipe_stdout=True)) pcm_raw = async_pipe.stdout.read() async_pipe.wait() - x_raw = np.frombuffer(pcm_raw, dtype=fp32) ## Reading the Audio Synchronously else: - pcm_out, _ = ( + pcm_raw, _ = ( ffmpeg .input(filepath) - .output("pipe:", - format = ffmpeg_format, - acodec = ffmpeg_acodec, - ac = ch, - ar = fs, - loglevel = "error") + .output("pipe:", **kwgs_output) .run( capture_stdout = True, capture_stderr = True, )) - x_raw = np.frombuffer(pcm_out, dtype=fp32) + ## Converting the Raw PCM Data to Float32 NumPy Array + x_raw = np.frombuffer(pcm_raw, dtype=fp32).reshape(-1, ch).T + ## Resampling using `librosa`, if necessary + x_res = rosa.resample(x_raw, orig_sr=fs_origin, target_sr=fs, + res_type=res_type, axis=-1) if fs != fs_origin else x_raw ## Converting to the target data type if specified if dtype: dt = np.dtype(dtype) if dt.kind == "i": - x_raw = np.clip(x_raw, -1., +1.) - x_raw = (x_raw * (np.iinfo(dt).max - 1)).astype(dt) + x_res = np.clip(x_res, -1., +1.) + x_res = (x_res * (np.iinfo(dt).max - 1)).astype(dt) ## Returning with shapped as (channels, samples) - return x_raw.reshape(-1, ch).T, fs + return x_res, fs +## +def wavread_rosa(filepath:str, + fs:int=22050, mono:bool=True, + dtype=np.float32, res_type:str="soxr_hq", + **kwargs) -> tuple[np.ndarray, int]: + """ + Reads an audio file using `ffmpeg` & + returns the audio data as an `np.ndarray` & + the sample rate (`fs`). + Args: + filepath (str): Path to the audio file. + fs (int, optional): Desired sample rate. Defaults to 22050. + mono (bool, optional): If True, converts the audio to mono. + Defaults to True. + dtype (data-type, optional): Desired data type + for the output array. If None, the data is returned + as 32-bit float. Defaults to `np.float32`. + res_type (str, optional): Resampling type + defined in `librosa`, defaulting to "soxr_hq". + **kwargs: Additional arguments. 
+ Returns: + tuple[np.ndarray, int]: A tuple containing the audio data + as a NumPy array and the sample rate. + """ + return wavread(filepath, + fs = fs, + ch = 1 if mono else 2, + dtype = dtype, + res_type = res_type, + **kwargs) ## From ea236804e7551d53c8d2c542dd7614d676504650 Mon Sep 17 00:00:00 2001 From: hyperuser178 Date: Sun, 14 Dec 2025 02:10:55 +0000 Subject: [PATCH 3/5] Fix `wavread_rosa` call during handling single channel audio --- rvc/lib/uvr5_pack/ffio.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rvc/lib/uvr5_pack/ffio.py b/rvc/lib/uvr5_pack/ffio.py index afdd208..dfe1354 100644 --- a/rvc/lib/uvr5_pack/ffio.py +++ b/rvc/lib/uvr5_pack/ffio.py @@ -104,10 +104,13 @@ def wavread_rosa(filepath:str, tuple[np.ndarray, int]: A tuple containing the audio data as a NumPy array and the sample rate. """ - return wavread(filepath, + x, fs = wavread(filepath, fs = fs, ch = 1 if mono else 2, dtype = dtype, res_type = res_type, **kwargs) + if x.shape[0] < 2: + x = x.squeeze(0) + return x, fs ## From e1192deabaae0b6b705c9f34b32123a5721427ca Mon Sep 17 00:00:00 2001 From: hyperuser178 Date: Sun, 14 Dec 2025 02:18:11 +0000 Subject: [PATCH 4/5] Implement ffmpeg loading option for audio processing Added option to load audio using ffmpeg in process method. 
--- rvc/modules/uvr5/vr.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/rvc/modules/uvr5/vr.py b/rvc/modules/uvr5/vr.py index caebe31..1482b69 100644 --- a/rvc/modules/uvr5/vr.py +++ b/rvc/modules/uvr5/vr.py @@ -12,6 +12,7 @@ from rvc.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters from rvc.lib.uvr5_pack.lib_v5.nets_new import CascadedNet from rvc.lib.uvr5_pack.utils import inference +from rvc.lib.uvr5_pack.ffio import wavread_rosa as ffread_rosa logger = logging.getLogger(__name__) @@ -49,7 +50,11 @@ def __init__(self, model_path, agg, tta=False): def process( self, music_file, - ): + ## Param for trying to read audio using `ffmpeg`, + ## but still resampling using `librosa.resample`, + ## implemented in the file ".../uvr5_pack/ffio.py" + load_using_ffmpeg:bool = False, + **kwargs): x_wave, y_wave, x_spec_s, y_spec_s = {}, {}, {}, {} bands_n = len(self.mp.param["band"]) @@ -57,13 +62,22 @@ def process( bp = self.mp.param["band"][d] if d == bands_n: # high-end band # librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain - x_wave[d] = librosa.core.load( - music_file, - sr=bp["sr"], - mono=False, - dtype=np.float32, - res_type=bp["res_type"], - )[0] + if load_using_ffmpeg: + x_wave[d] = ffread_rosa( + music_file, + fs = bp["sr"], + mono = False, + dtype = np.float32, + res_type = bp["res_type"], + )[0] + else: + x_wave[d] = librosa.core.load( + music_file, + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], + )[0] if x_wave[d].ndim == 1: x_wave[d] = np.asfortranarray([x_wave[d], x_wave[d]]) else: # lower bands From 72396aa00fc448e2e402ed23e6e9522d67564359 Mon Sep 17 00:00:00 2001 From: hyperuser178 Date: Sun, 14 Dec 2025 02:23:09 +0000 Subject: [PATCH 5/5] Added an audio loading method based on `ffmpeg` as an option The function is implemented in new file `.../uvr5_pack/ffio.py`. 
However, it has not been seriously unit-tested yet, because I'm not sure what kind of unit tests are required. --- rvc/modules/uvr5/vr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rvc/modules/uvr5/vr.py b/rvc/modules/uvr5/vr.py index 1482b69..8caefe0 100644 --- a/rvc/modules/uvr5/vr.py +++ b/rvc/modules/uvr5/vr.py @@ -62,7 +62,7 @@ def process( bp = self.mp.param["band"][d] if d == bands_n: # high-end band # librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain - if load_using_ffmpeg: + if load_using_ffmpeg: # [TODO] Serious Unit Tests may be Required x_wave[d] = ffread_rosa( music_file, fs = bp["sr"],