From 39400a6c3b1339fe7306d5e3ba823cf9ba0b05a6 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 23 May 2025 15:51:31 -0400 Subject: [PATCH 1/5] General cleanup & test improvements --- .github/workflows/tests.yml | 11 +- benchmarking/int8/row_scale_benchmark.py | 70 ------- deploy.sh | 237 ----------------------- environment-bnb.yml | 21 -- environment.yml | 46 ----- tests/test_autograd.py | 7 + tests/test_functional.py | 8 +- tests/test_ops.py | 33 ++-- tests/test_triton.py | 2 +- 9 files changed, 40 insertions(+), 395 deletions(-) delete mode 100644 benchmarking/int8/row_scale_benchmark.py delete mode 100644 deploy.sh delete mode 100644 environment-bnb.yml delete mode 100644 environment.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5d2a2708b..2d206b76a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -93,14 +93,15 @@ jobs: path: output/${{ matrix.os }}/${{ matrix.arch }}/* retention-days: 7 - cpu-tests: + test-cpu: if: github.repository == 'bitsandbytes-foundation/bitsandbytes' needs: build-cpu strategy: fail-fast: false matrix: os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15] - torch_version: ["2.6.0", "2.7.0"] + # Test with the oldest supported torch version and the two newest. + torch_version: ["2.2.2", "2.6.0", "2.7.0"] include: - os: ubuntu-22.04 arch: x86_64 @@ -144,7 +145,7 @@ jobs: - name: Run tests run: pytest --durations=100 - # cuda-aarch64-tests: + # test-cuda-aarch64: # if: github.repository == 'bitsandbytes-foundation/bitsandbytes' # needs: build-cuda # strategy: @@ -167,7 +168,7 @@ jobs: - cuda-tests: + test-cuda: if: github.repository == 'bitsandbytes-foundation/bitsandbytes' needs: build-cuda strategy: @@ -179,7 +180,7 @@ jobs: cuda_version: ["11.8.0", "12.6.3", "12.8.1"] include: - cuda_version: "11.8.0" - torch_version: "2.4.1" + torch_version: "2.2.2" pypi_index: "https://download.pytorch.org/whl/cu118" - cuda_version: "12.6.3" torch_version: "2.6.0" diff --git a/benchmarking/int8/row_scale_benchmark.py b/benchmarking/int8/row_scale_benchmark.py deleted file mode 100644 index 98d2496de..000000000 --- a/benchmarking/int8/row_scale_benchmark.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Extracted from tests/test_functional.py - -Note: This feature is currently unused! It is kept here for archival purposes. 
- -Usage: pytest benchmarking/int8/row_scale_benchmark.py -""" - -import time - -import pytest -import torch - -from bitsandbytes import functional as F - -k = 20 -torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000) - - -@pytest.mark.parametrize( - ("dim1", "dim4", "inner"), - [ - pytest.param(1024, 12288 * 4, 12288, id="1024, 12288*4, 12288"), - pytest.param(2048, 4096 * 4, 4096, id="2048, 4096*4, 4096"), - ], -) -@pytest.mark.skip("Row scale has some bugs for ampere") -@pytest.mark.benchmark -def test_row_scale_bench(dim1, dim4, inner): - formatB = F.get_special_format_str() - err1, err2, err3 = [], [], [] - relerr1, relerr2 = [], [] - scale = 1 - A = torch.randn(dim1, inner, device="cuda").half() - B = torch.randn(dim4, inner, device="cuda").half() - torch.nn.init.xavier_uniform_(B) - # warmpup - for i in range(k): - C1 = torch.matmul(A, B.t()) - - torch.cuda.synchronize() - t0 = time.time() - for i in range(k): - C1 = torch.matmul(A, B.t()) - torch.cuda.synchronize() - print("16", time.time() - t0) - - C1a, C1b, stats1a, stats1b, coo_tensor = F.int8_double_quant(A) - CB, absmaxB = F.vectorwise_quant(B, quant_type="linear") - A2, SA = F.nvidia_transform(C1a, "col32") - B2, SB = F.nvidia_transform(CB, formatB) - A1, maxA = F.vectorwise_quant(A, dim=1) - - c = 10.0 * inner * scale - row_scale = maxA / c - torch.cuda.synchronize() - t0 = time.time() - for i in range(k): - outC32 = F.int8_linear_matmul(A2, B2, dtype=torch.int8, row_scale=row_scale) - torch.cuda.synchronize() - print("row-wise", time.time() - t0) - - C2a, C2b, stats2a, stats2b, coo_tensor = F.int8_double_quant(B) - B2, SB = F.nvidia_transform(C2a, formatB) - torch.cuda.synchronize() - t0 = time.time() - for i in range(k): - outC32 = F.int8_linear_matmul(A2, B2) - torch.cuda.synchronize() - print("vector-wise", time.time() - t0) diff --git a/deploy.sh b/deploy.sh deleted file mode 100644 index e60373627..000000000 --- a/deploy.sh +++ /dev/null @@ -1,237 +0,0 @@ -#!/bin/bash -BASE_PATH=$1 - -echo "MAKE SURE LD_LIBRARY_PATH IS EMPTY!" -echo $LD_LIBRARY_PATH - -if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - - -module unload cuda && echo "no module function available. Probably not on a slurm cluster." -module unload gcc && echo "no module function available. Probably not on a slurm cluster." - -rm -rf dist build -make cleaneggs -make cleanlibs - -rm -rf build/* -export CUDA_HOME= -export CUDA_VERSION= -make cpuonly CUDA_VERSION="CPU" - -if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.0 -make cuda110 CUDA_VERSION=110 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.1 -make cuda11x CUDA_VERSION=111 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.4 -make cuda11x CUDA_VERSION=114 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 
1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.5 -make cuda11x CUDA_VERSION=115 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.7 -make cuda11x CUDA_VERSION=117 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.8 -make cuda118 CUDA_VERSION=118 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-12.0 -make cuda12x CUDA_VERSION=120 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-12.1 -make cuda12x CUDA_VERSION=121 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-12.2 -make cuda12x CUDA_VERSION=122 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-12.3 -make cuda12x CUDA_VERSION=123 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -############################# START NO CUBLASLT ############################################# -# binaries without 8-bit matmul support START HERE -# ########################################################################################### - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.0 -make cuda110_nomatmul CUDA_VERSION=110 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.1 -make cuda11x_nomatmul CUDA_VERSION=111 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.4 -make cuda11x_nomatmul CUDA_VERSION=114 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.5 -make cuda11x_nomatmul CUDA_VERSION=115 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.7 -make cuda11x_nomatmul CUDA_VERSION=117 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 
1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-11.8 -make cuda118_nomatmul CUDA_VERSION=118 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-12.0 -make cuda12x_nomatmul CUDA_VERSION=120 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-12.1 -make cuda12x_nomatmul CUDA_VERSION=121 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-12.2 -make cuda12x_nomatmul CUDA_VERSION=122 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -rm -rf build/* -export CUDA_HOME=$BASE_PATH/cuda-12.3 -make cuda12x_nomatmul CUDA_VERSION=123 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessful!" 1>&2 - exit 64 -fi - -python -m build -python -m twine upload dist/* --verbose diff --git a/environment-bnb.yml b/environment-bnb.yml deleted file mode 100644 index 1214f7930..000000000 --- a/environment-bnb.yml +++ /dev/null @@ -1,21 +0,0 @@ -# for cmake build -name: bnb -channels: - - pytorch - - nvidia - - conda-forge - -dependencies: - - python - #- accelerate - #- einops - - scipy - #- transformers - - pytest - - pytest-cases - - ipython - - debugpy - - yapf - - monkeytype - - rich - - pytest-sugar diff --git a/environment.yml b/environment.yml deleted file mode 100644 index af421b3c6..000000000 --- a/environment.yml +++ /dev/null @@ -1,46 +0,0 @@ -name: bnb -channels: - - pytorch - - nvidia - - conda-forge - -dependencies: - # Base - - conda-forge::python=3.8 - - pytorch::pytorch=>2.1 - - pytorch::pytorch-cuda=11.8 - - nvidia::cuda=11.8 - # Libraries - - conda-forge::accelerate - - conda-forge::einops - - conda-forge::scipy - - conda-forge::transformers - # Development - - conda-forge::pytest - - conda-forge::build # build Python packages - - conda-forge::twine # upload Python packages - - conda-forge::pytest-cases # more readable and composable parametrized tests - - conda-forge::ipython # better interactive shell - - conda-forge::debugpy # debugger-support for VSCode - - conda-forge::ruff # linting - - conda-forge::yapf # code formatting - - conda-forge::monkeytype # infer type annotations - - conda-forge::rich # better, colored tracebacks, etc - - conda-forge::pytest-sugar # better pytest output - # - conda-forge::nodejs # for `doc-builder preview` (optional) - -## ENV CREATION - steps to reproduce: -# mamba env remove -n bnb -# mamba create -y -n bnb python=3.8 # creating an empty env bypasses conda -# # and leads to much faster env resolution in the next step https://github.com/mamba-org/mamba/issues/633#issuecomment-812272143 -# mamba env update -n bnb -f environment.yml -# mamba activate bnb - -## PIP dependencies (install *after* ENV CREATION): -# pip install --no-cache-dir --no-deps lion_pytorch triton hf-doc-builder watchdog -## NOTE: conda peft is not up to date, so we install 
from pip -# cd pip install -e . ## installs bitsandbytes as editable development install from within repo root dir - -## ENV UPDATE: -# # add new packages to environment.yml, then: -# mamba env update -n bnb -f environment.yml diff --git a/tests/test_autograd.py b/tests/test_autograd.py index b6ba284c9..fc2e7aa6f 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -49,6 +49,10 @@ def test_matmullt( req_grad = list(req_grad) req_grad[2] = False + if device == "cpu" and dtype != torch.float32 and has_fp16_weights and any(req_grad): + if torch.__version__ < (2, 6): + pytest.xfail("mse_loss bf16/fp16 on CPU is not supported in torch < 2.6") + for i in range(3): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: @@ -185,6 +189,9 @@ def test_matmul_4bit( req_grad = list(req_grad) req_grad[2] = False + if device == "cpu" and dtype != torch.float32 and any(req_grad) and torch.__version__ < (2, 6): + pytest.xfail("mse_loss fp16 on CPU is not supported in torch < 2.6") + for i in range(3): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: diff --git a/tests/test_functional.py b/tests/test_functional.py index 0b9390aaa..8568d45f0 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1342,8 +1342,12 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"]) def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant): - if device == "cpu" and storage_type != "nf4": - pytest.xfail("fp4 quantization is not supported on CPU") + if device == "cpu": + if storage_type != "nf4": + pytest.xfail("fp4 quantization is not supported on CPU") + + if dtype == torch.bfloat16 and torch.__version__ < (2, 3): + pytest.xfail("eye doe not support bfloat16 on CPU in torch < 2.3") dims = 10 torch.random.manual_seed(np.random.randint(0, 412424242)) diff --git a/tests/test_ops.py b/tests/test_ops.py index 4da1663f0..e85bc0ef0 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -6,6 +6,13 @@ import bitsandbytes from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter +# torch.library.opcheck is only available in torch 2.4 and later. +# When testing with older versions, we will skip it as a no-op. 
+if torch.__version__ >= (2, 4): + opcheck = torch.library.opcheck +else: + opcheck = lambda *args, **kwargs: None + class TestLLMInt8Ops: @pytest.mark.parametrize("device", get_available_devices()) @@ -18,7 +25,7 @@ def test_int8_linear_matmul(self, device): assert out.dtype == torch.int32 assert out.device == A.device - torch.library.opcheck(torch.ops.bitsandbytes.int8_linear_matmul.default, (A, B)) + opcheck(torch.ops.bitsandbytes.int8_linear_matmul.default, (A, B)) @pytest.mark.parametrize("device", get_available_devices()) def test_int8_linear_matmul_out(self, device): @@ -32,7 +39,7 @@ def test_int8_linear_matmul_out(self, device): assert out.dtype == torch.int32 assert out.device == A.device - torch.library.opcheck(torch.ops.bitsandbytes.int8_linear_matmul.out, (A, B, out)) + opcheck(torch.ops.bitsandbytes.int8_linear_matmul.out, (A, B, out)) @pytest.mark.parametrize("threshold", [0.0, 6.0]) @pytest.mark.parametrize("device", get_available_devices()) @@ -57,9 +64,8 @@ def test_int8_vectorwise_quant(self, threshold, device): else: assert outlier_cols is None - torch.library.opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A,)) - - torch.library.opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A, threshold)) + opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A,)) + opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A, threshold)) @pytest.mark.parametrize("device", get_available_devices()) def test_int8_mm_dequant(self, device): @@ -72,7 +78,7 @@ def test_int8_mm_dequant(self, device): assert out.dtype == torch.float16 assert out.device == A.device - torch.library.opcheck(torch.ops.bitsandbytes.int8_mm_dequant, (A, row_stats, col_stats)) + opcheck(torch.ops.bitsandbytes.int8_mm_dequant, (A, row_stats, col_stats)) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) @@ -89,7 +95,7 @@ def test_int8_scaled_mm(self, device, dtype, has_bias): assert out.dtype == dtype assert out.device == A.device - torch.library.opcheck(torch.ops.bitsandbytes.int8_scaled_mm, (A, B, row_stats, col_stats, bias, dtype)) + opcheck(torch.ops.bitsandbytes.int8_scaled_mm, (A, B, row_stats, col_stats, bias, dtype)) class TestInt8BlockwiseQuantOps: @@ -115,7 +121,7 @@ def test_quantize_blockwise(self, device, dtype, blocksize): assert absmax.device == A.device assert absmax.dtype == torch.float32 - torch.library.opcheck(torch.ops.bitsandbytes.quantize_blockwise, (A, code, blocksize)) + opcheck(torch.ops.bitsandbytes.quantize_blockwise, (A, code, blocksize)) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) @@ -137,7 +143,7 @@ def test_dequantize_blockwise(self, device, dtype, blocksize): assert out.dtype == dtype assert out.device == A.device - torch.library.opcheck(torch.ops.bitsandbytes.dequantize_blockwise.default, (A, absmax, code, blocksize, dtype)) + opcheck(torch.ops.bitsandbytes.dequantize_blockwise.default, (A, absmax, code, blocksize, dtype)) class Test4bitBlockwiseQuantOps: @@ -163,7 +169,7 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize assert absmax.device == A.device assert absmax.dtype == torch.float32 - torch.library.opcheck(torch.ops.bitsandbytes.quantize_4bit, (A, blocksize, quant_type, storage_dtype)) + opcheck(torch.ops.bitsandbytes.quantize_4bit, (A, blocksize, quant_type, storage_dtype)) 
@pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) @@ -198,8 +204,9 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi assert out.device == A.device assert out.shape == shape - torch.library.opcheck( - torch.ops.bitsandbytes.dequantize_4bit.default, (A, absmax, blocksize, quant_type, shape, dtype) + opcheck( + torch.ops.bitsandbytes.dequantize_4bit.default, + (A, absmax, blocksize, quant_type, shape, dtype), ) @pytest.mark.parametrize("device", get_available_devices()) @@ -226,4 +233,4 @@ def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize): assert out.shape == (1, 1, out_features) assert out.isreal().all() - torch.library.opcheck(torch.ops.bitsandbytes.gemv_4bit.default, (A, B_q, B.shape, absmax, code, blocksize)) + opcheck(torch.ops.bitsandbytes.gemv_4bit.default, (A, B_q, B.shape, absmax, code, blocksize)) diff --git a/tests/test_triton.py b/tests/test_triton.py index 70656a56f..b245e534a 100644 --- a/tests/test_triton.py +++ b/tests/test_triton.py @@ -11,7 +11,7 @@ not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8, reason="This test requires triton and a GPU with compute capability 8.0 or higher.", ) -@pytest.mark.skip("No longer supported.") +@pytest.mark.deprecated @pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE) def test_switchback(vector_wise_quantization): for dim in [83]: From bd49a46e0dbcb9f04b89d14b8d4df2cea9a6ee41 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 23 May 2025 16:20:19 -0400 Subject: [PATCH 2/5] Tests: WA numpy 2 compat issue for torch<2.3 --- .github/workflows/tests.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2d206b76a..df154335d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -136,6 +136,11 @@ jobs: pip install -e ".[test]" pip install pytest-cov + # We need to downgrade to numpy<2 for torch<2.3 compatibility. + - name: Downgrade NumPy + if: startsWith(matrix.torch_version, '2.2.') + run: pip install "numpy<2" + - name: Show installed packages run: pip list @@ -239,6 +244,11 @@ jobs: pip install -e ".[test]" pip install pytest-cov + # We need to downgrade to numpy<2 for torch<2.3 compatibility. 
+ - name: Downgrade NumPy + if: startsWith(matrix.torch_version, '2.2.') + run: pip install "numpy<2" + - name: Show installed packages run: pip list From 3fa8e9234d3d0b5550b0f5b0633f18725f41a27f Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 23 May 2025 17:18:40 -0400 Subject: [PATCH 3/5] Tests: update aarch64 cpu min torch version --- .github/workflows/tests.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index df154335d..a788c2bcb 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -108,10 +108,17 @@ jobs: runner: banb-aws-general-8-plus-use1-public-80 - os: ubuntu-22.04-arm arch: aarch64 + - os: ubuntu-22.04-arm + arch: aarch64 + torch_version: "2.3.1" - os: windows-2025 arch: x86_64 - os: macos-15 arch: arm64 + exclude: + - os: ubuntu-22.04-arm + torch_version: "2.2.2" + runs-on: ${{ matrix.runner || matrix.os }} env: BNB_TEST_DEVICE: cpu From 2a11fa8b5a0639e2308d9ff0d35355a99f82965d Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 23 May 2025 17:31:20 -0400 Subject: [PATCH 4/5] Tests: update aarch64 cpu min torch version --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a788c2bcb..1939b9961 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -110,7 +110,7 @@ jobs: arch: aarch64 - os: ubuntu-22.04-arm arch: aarch64 - torch_version: "2.3.1" + torch_version: "2.4.1" - os: windows-2025 arch: x86_64 - os: macos-15 From 6c085212b6216bd50e86b690689ae54b526f743a Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 23 May 2025 18:06:39 -0400 Subject: [PATCH 5/5] Tests: update aarch64 cpu min torch version --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1939b9961..f1a5dca69 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -110,7 +110,7 @@ jobs: arch: aarch64 - os: ubuntu-22.04-arm arch: aarch64 - torch_version: "2.4.1" + torch_version: "2.5.1" - os: windows-2025 arch: x86_64 - os: macos-15
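
For reference, a minimal standalone Python sketch (not part of any commit above) of the two version-gating idioms these test changes rely on: comparing torch.__version__ against a tuple, and shimming torch.library.opcheck to a no-op on torch < 2.4. The variable name needs_mse_loss_workaround is illustrative only and does not appear in the diffs.

import torch

# torch.__version__ is a TorchVersion (a str subclass) that supports ordered
# comparison against tuples of ints, which is what the new xfail guards in
# tests/test_autograd.py and tests/test_functional.py depend on.
needs_mse_loss_workaround = torch.__version__ < (2, 6)

# tests/test_ops.py aliases torch.library.opcheck, which only exists in
# torch >= 2.4, and falls back to a no-op so the same test bodies still run
# on older releases.
if torch.__version__ >= (2, 4):
    opcheck = torch.library.opcheck
else:
    opcheck = lambda *args, **kwargs: None

print(torch.__version__, needs_mse_loss_workaround)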