From f55ca2a3c17e6c0ee2af53d0ac6d12304a820e90 Mon Sep 17 00:00:00 2001
From: hyperuser178 <hyperuser178@gmail.com>
Date: Sun, 14 Dec 2025 01:14:46 +0000
Subject: [PATCH 1/5] Add `ffmpeg`-based function `wavread` to read audio files

This function reads audio files using ffmpeg, allowing for asynchronous reading and various output configurations.
---
 rvc/lib/uvr5_pack/ffio.py | 79 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 rvc/lib/uvr5_pack/ffio.py

diff --git a/rvc/lib/uvr5_pack/ffio.py b/rvc/lib/uvr5_pack/ffio.py
new file mode 100644
index 0000000..bdc74c8
--- /dev/null
+++ b/rvc/lib/uvr5_pack/ffio.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+import numpy as np, ffmpeg
+##
+def wavread(
+    filepath:str, fs:int=0, ch:int=0,
+    dtype=None, read_async=False,
+  **kwargs) -> tuple[np.ndarray, int]:
+    """
+    Reads an audio file using `ffmpeg` &
+        returns the audio data as an `np.ndarray` &
+        the sample rate (`fs`).
+    Args:
+        filepath (str): Path to the audio file.
+        fs (int, optional): Desired sample rate. If 0,
+            the original sample rate is used. Defaults to 0.
+        ch (int, optional): Desired number of channels. If 0,
+            the original number of channels is used. Defaults to 0.
+        dtype (data-type, optional): Desired data type
+            for the output array. If None, the data is returned
+            as 32-bit float. Defaults to None.
+        read_async (bool, optional): If True, reads the audio data
+            asynchronously. Defaults to False.
+        **kwargs: Additional arguments.
+    Returns:
+        tuple[np.ndarray, int]: A tuple containing the audio data
+            as a NumPy array and the sample rate.
+    """
+    ## Performing FFProbe the Audio File to get Stream Information
+    d_probe = ffmpeg.probe(filepath)
+    st_audio = next(
+        s for s in d_probe["streams"] \
+        if s["codec_type"] == "audio")
+    ## Using original sample rate and channels if not specified
+    fs = fs or int(st_audio["sample_rate"])
+    ch = ch or int(st_audio["channels"])
+    ## Determining the float32 format based on system endianness
+    fp32 = "<f4" if np.little_endian else ">f4"
+    ffmpeg_format = "f32le" if np.little_endian else "f32be"
+    ffmpeg_acodec = f"pcm_{ffmpeg_format}"
+    ## Reading the Audio Asynchronously
+    if read_async:
+        async_pipe = (
+            ffmpeg
+            .input(filepath)
+            .output("pipe:",
+                format = ffmpeg_format,
+                acodec = ffmpeg_acodec,
+                ac = ch,
+                ar = fs,
+                loglevel = "error")
+            .run_async(pipe_stdout=True))
+        pcm_raw = async_pipe.stdout.read()
+        async_pipe.wait()
+        x_raw = np.frombuffer(pcm_raw, dtype=fp32)
+    ## Reading the Audio Synchronously
+    else:
+        pcm_out, _ = (
+            ffmpeg
+            .input(filepath)
+            .output("pipe:",
+                format = ffmpeg_format,
+                acodec = ffmpeg_acodec,
+                ac = ch,
+                ar = fs,
+                loglevel = "error")
+            .run(
+                capture_stdout = True,
+                capture_stderr = True,
+                ))
+        x_raw = np.frombuffer(pcm_out, dtype=fp32)
+    ## Converting to the target data type if specified
+    if dtype:
+        dt = np.dtype(dtype)
+        if dt.kind == "i":
+            x_raw = np.clip(x_raw, -1., +1.)
+            x_raw = (x_raw * (np.iinfo(dt).max - 1)).astype(dt)
+    ## Returning with shapped as (channels, samples)
+    return x_raw.reshape(-1, ch).T, fs
+##

From 772ae871b26fcffd3524056b47949f015276a3c4 Mon Sep 17 00:00:00 2001
From: hyperuser178 <hyperuser178@gmail.com>
Date: Sun, 14 Dec 2025 02:04:05 +0000
Subject: [PATCH 2/5] Added `wavread_rosa` based on `librosa` params &
 `librosa` based resampling

Updated `wavread` function to include resampling using `librosa`, & added `wavread_rosa` for `librosa`-like params.

Because the `res_type` parameter is defined by `librosa`, resampling is also performed with `librosa.resample`.
---
 rvc/lib/uvr5_pack/ffio.py | 84 +++++++++++++++++++++++++++------------
 1 file changed, 59 insertions(+), 25 deletions(-)

diff --git a/rvc/lib/uvr5_pack/ffio.py b/rvc/lib/uvr5_pack/ffio.py
index bdc74c8..afdd208 100644
--- a/rvc/lib/uvr5_pack/ffio.py
+++ b/rvc/lib/uvr5_pack/ffio.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
-import numpy as np, ffmpeg
+import numpy as np, librosa as rosa, ffmpeg
 ##
 def wavread(
     filepath:str, fs:int=0, ch:int=0,
-    dtype=None, read_async=False,
+    dtype=None, read_async=False, res_type:str="soxr_hq",
   **kwargs) -> tuple[np.ndarray, int]:
     """
     Reads an audio file using `ffmpeg` &
@@ -20,6 +20,8 @@ def wavread(
             as 32-bit float. Defaults to None.
         read_async (bool, optional): If True, reads the audio data
             asynchronously. Defaults to False.
+        res_type (str, optional): Resampling type
+            defined in `librosa`, defaulting to "soxr_hq".
         **kwargs: Additional arguments.
     Returns:
         tuple[np.ndarray, int]: A tuple containing the audio data
@@ -30,50 +32,82 @@ def wavread(
     st_audio = next(
         s for s in d_probe["streams"] \
         if s["codec_type"] == "audio")
-    ## Using original sample rate and channels if not specified
-    fs = fs or int(st_audio["sample_rate"])
-    ch = ch or int(st_audio["channels"])
+    ch_origin = int(st_audio["channels"])
+    fs_origin = int(st_audio["sample_rate"])
+    ch = ch or ch_origin
+    fs = fs or fs_origin
     ## Determining the float32 format based on system endianness
-    fp32 = "<f4" if np.little_endian else ">f4"
-    ffmpeg_format = "f32le" if np.little_endian else "f32be"
+    fp32, ffmpeg_format = ("<f4", "f32le") if np.little_endian else \
+                          (">f4", "f32be")
     ffmpeg_acodec = f"pcm_{ffmpeg_format}"
+    ## Setting Keyword-based (non-positional) Args of `ffmpeg.output`
+    kwgs_output = {
+        "format": ffmpeg_format,
+        "acodec": ffmpeg_acodec,
+        ## Using original sample rate and channels if not specified
+        "ac": ch,
+        #"ar": fs, # Resampling later using `librosa`
+        "loglevel": "error"}
     ## Reading the Audio Asynchronously
     if read_async:
         async_pipe = (
             ffmpeg
             .input(filepath)
-            .output("pipe:",
-                format = ffmpeg_format,
-                acodec = ffmpeg_acodec,
-                ac = ch,
-                ar = fs,
-                loglevel = "error")
+            .output("pipe:", **kwgs_output)
             .run_async(pipe_stdout=True))
         pcm_raw = async_pipe.stdout.read()
         async_pipe.wait()
-        x_raw = np.frombuffer(pcm_raw, dtype=fp32)
     ## Reading the Audio Synchronously
     else:
-        pcm_out, _ = (
+        pcm_raw, _ = (
             ffmpeg
             .input(filepath)
-            .output("pipe:",
-                format = ffmpeg_format,
-                acodec = ffmpeg_acodec,
-                ac = ch,
-                ar = fs,
-                loglevel = "error")
+            .output("pipe:", **kwgs_output)
             .run(
                 capture_stdout = True,
                 capture_stderr = True,
                 ))
-        x_raw = np.frombuffer(pcm_out, dtype=fp32)
+    ## Converting the Raw PCM Data to Float32 NumPy Array
+    x_raw = np.frombuffer(pcm_raw, dtype=fp32).reshape(-1, ch).T
+    ## Resampling using `librosa`, if necessary
+    x_res = rosa.resample(x_raw, orig_sr=fs_origin, target_sr=fs,
+        res_type=res_type, axis=-1) if fs != fs_origin else x_raw
     ## Converting to the target data type if specified
     if dtype:
         dt = np.dtype(dtype)
         if dt.kind == "i":
-            x_raw = np.clip(x_raw, -1., +1.)
-            x_raw = (x_raw * (np.iinfo(dt).max - 1)).astype(dt)
+            x_res = np.clip(x_res, -1., +1.)
+            x_res = (x_res * (np.iinfo(dt).max - 1)).astype(dt)
     ## Returning with shapped as (channels, samples)
-    return x_raw.reshape(-1, ch).T, fs
+    return x_res, fs
+##
+def wavread_rosa(filepath:str,
+    fs:int=22050, mono:bool=True,
+    dtype=np.float32, res_type:str="soxr_hq",
+  **kwargs) -> tuple[np.ndarray, int]:
+    """
+    Reads an audio file using `ffmpeg` &
+        returns the audio data as an `np.ndarray` &
+        the sample rate (`fs`).
+    Args:
+        filepath (str): Path to the audio file.
+        fs (int, optional): Desired sample rate. Defaults to 22050.
+        mono (bool, optional): If True, converts the audio to mono.
+            Defaults to True.
+        dtype (data-type, optional): Desired data type
+            for the output array. If None, the data is returned
+            as 32-bit float. Defaults to `np.float32`.
+        res_type (str, optional): Resampling type
+            defined in `librosa`, defaulting to "soxr_hq".
+        **kwargs: Additional arguments.
+    Returns:
+        tuple[np.ndarray, int]: A tuple containing the audio data
+            as a NumPy array and the sample rate.
+    """
+    return wavread(filepath,
+        fs = fs,
+        ch = 1 if mono else 2,
+        dtype = dtype,
+        res_type = res_type,
+      **kwargs)
 ##

From ea236804e7551d53c8d2c542dd7614d676504650 Mon Sep 17 00:00:00 2001
From: hyperuser178 <hyperuser178@gmail.com>
Date: Sun, 14 Dec 2025 02:10:55 +0000
Subject: [PATCH 3/5] Fix `wavread_rosa` call when handling single-channel
 audio

---
 rvc/lib/uvr5_pack/ffio.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/rvc/lib/uvr5_pack/ffio.py b/rvc/lib/uvr5_pack/ffio.py
index afdd208..dfe1354 100644
--- a/rvc/lib/uvr5_pack/ffio.py
+++ b/rvc/lib/uvr5_pack/ffio.py
@@ -104,10 +104,13 @@ def wavread_rosa(filepath:str,
         tuple[np.ndarray, int]: A tuple containing the audio data
             as a NumPy array and the sample rate.
     """
-    return wavread(filepath,
+    x, fs = wavread(filepath,
         fs = fs,
         ch = 1 if mono else 2,
         dtype = dtype,
         res_type = res_type,
       **kwargs)
+    if x.shape[0] < 2:
+        x = x.squeeze(0)
+    return x, fs
 ##

From e1192deabaae0b6b705c9f34b32123a5721427ca Mon Sep 17 00:00:00 2001
From: hyperuser178 <hyperuser178@gmail.com>
Date: Sun, 14 Dec 2025 02:18:11 +0000
Subject: [PATCH 4/5] Implement ffmpeg loading option for audio processing

Added option to load audio using ffmpeg in process method.
---
 rvc/modules/uvr5/vr.py | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/rvc/modules/uvr5/vr.py b/rvc/modules/uvr5/vr.py
index caebe31..1482b69 100644
--- a/rvc/modules/uvr5/vr.py
+++ b/rvc/modules/uvr5/vr.py
@@ -12,6 +12,7 @@
 from rvc.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters
 from rvc.lib.uvr5_pack.lib_v5.nets_new import CascadedNet
 from rvc.lib.uvr5_pack.utils import inference
+from rvc.lib.uvr5_pack.ffio import wavread_rosa as ffread_rosa
 
 logger = logging.getLogger(__name__)
 
@@ -49,7 +50,11 @@ def __init__(self, model_path, agg, tta=False):
     def process(
         self,
         music_file,
-    ):
+        ## If True, the audio is read using `ffmpeg` (the reader is
+        ##  implemented in ".../uvr5_pack/ffio.py"), while resampling
+        ##  is still performed using `librosa.resample`
+        load_using_ffmpeg:bool = False,
+      **kwargs):
         x_wave, y_wave, x_spec_s, y_spec_s = {}, {}, {}, {}
         bands_n = len(self.mp.param["band"])
 
@@ -57,13 +62,22 @@ def process(
             bp = self.mp.param["band"][d]
             if d == bands_n:  # high-end band
                 # librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain
-                x_wave[d] = librosa.core.load(
-                    music_file,
-                    sr=bp["sr"],
-                    mono=False,
-                    dtype=np.float32,
-                    res_type=bp["res_type"],
-                )[0]
+                if load_using_ffmpeg:
+                    x_wave[d] = ffread_rosa(
+                        music_file,
+                        fs = bp["sr"],
+                        mono = False,
+                        dtype = np.float32,
+                        res_type = bp["res_type"],
+                        )[0]
+                else:
+                    x_wave[d] = librosa.core.load(
+                        music_file,
+                        sr=bp["sr"],
+                        mono=False,
+                        dtype=np.float32,
+                        res_type=bp["res_type"],
+                        )[0]
                 if x_wave[d].ndim == 1:
                     x_wave[d] = np.asfortranarray([x_wave[d], x_wave[d]])
             else:  # lower bands

From 72396aa00fc448e2e402ed23e6e9522d67564359 Mon Sep 17 00:00:00 2001
From: hyperuser178 <hyperuser178@gmail.com>
Date: Sun, 14 Dec 2025 02:23:09 +0000
Subject: [PATCH 5/5] Added an audio loading method based on `ffmpeg` as an
 option

The function is implemented in new file `.../uvr5_pack/ffio.py`.
However, it has not been seriously unit-tested yet, because I am not sure what kind of unit tests are required.
---
 rvc/modules/uvr5/vr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rvc/modules/uvr5/vr.py b/rvc/modules/uvr5/vr.py
index 1482b69..8caefe0 100644
--- a/rvc/modules/uvr5/vr.py
+++ b/rvc/modules/uvr5/vr.py
@@ -62,7 +62,7 @@ def process(
             bp = self.mp.param["band"][d]
             if d == bands_n:  # high-end band
                 # librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain
-                if load_using_ffmpeg:
+                if load_using_ffmpeg: # [TODO] Serious Unit Tests may be Required
                     x_wave[d] = ffread_rosa(
                         music_file,
                         fs = bp["sr"],