From 40e6971a69c2e7524a5ddc53ea25e6d0aee946e5 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 22 Sep 2025 17:00:03 -0400
Subject: [PATCH 01/10] CUDA 13 build enablement

---
 .github/scripts/build-cuda.sh        |  5 +++-
 .github/workflows/python-package.yml |  8 ++---
 CMakeLists.txt                       | 44 ++++++++++++++++------------
 csrc/kernels.cu                      | 32 ++++++++++++--------
 csrc/pythonInterface.cpp             |  9 ++++++
 5 files changed, 62 insertions(+), 36 deletions(-)

diff --git a/.github/scripts/build-cuda.sh b/.github/scripts/build-cuda.sh
index b13d9c92b..9eed06896 100644
--- a/.github/scripts/build-cuda.sh
+++ b/.github/scripts/build-cuda.sh
@@ -12,13 +12,16 @@ elif [ "${build_arch}" = "aarch64" ]; then
     build_capability="75;80;90"
 
     # CUDA 12.8+: Add sm100/sm120
-    [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120"
+    [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* || "${cuda_version}" == 13.*.* ]] && build_capability="75;80;90;100;120"
 else
     # By default, target Pascal through Hopper.
     build_capability="60;70;75;80;86;89;90"
 
     # CUDA 12.8+: Add sm100 and sm120; remove < sm70 to align with PyTorch 2.8+cu128 minimum
     [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="70;75;80;86;89;90;100;120"
+
+    # CUDA 13.0+: Remove < sm75 to align with PyTorch 2.9+cu130 minimum
+    [[ "${cuda_version}" == 13.*.* ]] && build_capability="75;80;86;89;90;100;120"
 fi
 
 [[ "${build_os}" = windows-* ]] && python3 -m pip install ninja
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index a11b13f33..8b50074ce 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -72,17 +72,17 @@ jobs:
           - os: windows-latest
             arch: x86_64
         cuda_version:
-          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1"]
+          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1", "13.0.1"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
         # Windows: We install Cuda on the agent (slow)
-      - uses: Jimver/cuda-toolkit@c35baa1a18fd1fc9dcf47c5bd839bf30559c0bc3 # v0.2.24
+      - uses: Jimver/cuda-toolkit@433d453c1fa37d10a3254452fa8e284441c9192d # v0.2.27
         if: startsWith(matrix.os, 'windows')
         id: cuda-toolkit
         with:
-          # Temporary: Use CUDA 12.9.0 for Windows until 12.9.1 is supported with this action.
-          cuda: ${{ matrix.cuda_version == '12.9.1' && '12.9.0' || matrix.cuda_version }}
+          # Temporary: Use CUDA 13.0.0 for Windows until 13.0.1 is supported with this action.
+          cuda: ${{ matrix.cuda_version == '13.0.1' && '13.0.0' || matrix.cuda_version }}
           method: "network"
           sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
           linux-local-args: '["--toolkit"]'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 429570443..7c23a2799 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,30 +113,36 @@ if(BUILD_CUDA)
         )
     endif()
 
-    if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.4")
-        message(FATAL_ERROR "CUDA Version < 11.4 is not supported")
-    elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
-        message(FATAL_ERROR "CUDA Version > 12 is not supported")
+    if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.8")
+        message(FATAL_ERROR "CUDA Version < 11.8 is not supported")
+    elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "14.0")
+        message(FATAL_ERROR "CUDA Version > 13 is not supported")
     endif()
 
     # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL.
     if(CMAKE_VERSION VERSION_LESS "3.23.0")
         message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
 
-        # 11.4+ supports these at a minimum.
-        set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
-        set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
-
-        # CUDA 11.8 adds support for Ada and Hopper.
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
-        endif()
-
-        # CUDA 12.8 adds support for Blackwell.
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120)
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
+        if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
+            # Starting in CUDA 13.0, Thor Blackwell is renamed to SM110.
+            # Support for architectures older than Turing (SM75) is removed.
+            set(CMAKE_CUDA_ARCHITECTURES_ALL 75 80 86 87 88 89 90 100 103 110 120 121)
+            set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 80 90 100 110 120)
+        else()
+            # 11.8-12.9 supports these at a minimum.
+            set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87 89 90)
+            set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80 90)
+
+            # CUDA 12.8 adds support for Blackwell (SM100, SM101, SM120).
+            if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
+                list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120)
+                list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
+            endif()
+
+            # CUDA 12.9 adds SM103 (Blackwell B300) and SM121; nvcc 12.8 rejects both.
+            if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9")
+                list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 103 121)
+            endif()
+        endif()
     endif()
 
@@ -252,7 +258,7 @@ endif()
 
 set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
 add_library(bitsandbytes SHARED ${SRC_FILES})
-target_compile_features(bitsandbytes PUBLIC cxx_std_14)
+target_compile_features(bitsandbytes PUBLIC cxx_std_17)
 target_include_directories(bitsandbytes PUBLIC csrc include)
 
 
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index 738ae0cd1..2c232da80 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -16,6 +16,14 @@
 #include <math_constants.h>
 #include <mma.h>
 
+#if CCCL_VERSION >= 2008002
+#include <cuda/functional>
+#define CUB_REDUCTIONOP_MAX                                                                                            \
+    cuda::maximum<> {}
+#else
+#define CUB_REDUCTIONOP_MAX cub::Max()
+#endif
+
 #define HLF_MAX 65504
 #define TH 1024
 #define NUM 4
@@ -365,7 +373,7 @@ __global__ void kQuantizeBlockwise(
         for (int j = 0; j < NUM_PER_TH; j++)
             local_abs_max = fmaxf(local_abs_max, fabsf((float)vals[j]));
 
-        local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, cub::Max(), valid_items);
+        local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, CUB_REDUCTIONOP_MAX, valid_items);
 
         if (threadIdx.x == 0) {
             smem_absmax_value[0] = 1.0f / local_abs_max;
@@ -951,12 +959,12 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
     }
 
     __syncthreads();
-    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items);
+    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, CUB_REDUCTIONOP_MAX, valid_items);
     __syncthreads();
-    local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, cub::Max(), valid_items);
+    local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, CUB_REDUCTIONOP_MAX, valid_items);
     if (unorm != NULL) {
         __syncthreads();
-        local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items);
+        local_unorm = BlockReduce(temp_storage.reduce).Sum(local_unorm, valid_items);
     }
 
     if (threadIdx.x == 0) {
@@ -1162,13 +1170,13 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
     }
 
     __syncthreads();
-    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items);
+    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, CUB_REDUCTIONOP_MAX, valid_items);
     if (threadIdx.x == 0) {
         atomicMax(&new_max1[0], local_max_s1);
     }
     if (unorm != NULL) {
         __syncthreads();
-        local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items);
+        local_unorm = BlockReduce(temp_storage.reduce).Sum(local_unorm, valid_items);
         if (threadIdx.x == 0) {
             atomicAdd(&unorm[0], local_unorm);
         }
@@ -1473,11 +1481,11 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit2StateBlockwise(
         }
 
         //  reduce: 2.51/1.60 -> 2.67/1.69
-        new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max());
-        new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, cub::Max());
+        new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, CUB_REDUCTIONOP_MAX);
+        new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, CUB_REDUCTIONOP_MAX);
 
         if (OPTIMIZER == ADEMAMIX) {
-            new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, cub::Max());
+            new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, CUB_REDUCTIONOP_MAX);
         }
 
         if (threadIdx.x == 0) {
@@ -1686,7 +1694,7 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit1StateBlockwise(
         }
 
         //  reduce: 2.51/1.60 -> 2.67/1.69
-        new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max());
+        new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, CUB_REDUCTIONOP_MAX);
 
         if (threadIdx.x == 0)
             smem_exchange1[0] = new_local_abs_max1;
@@ -1792,7 +1800,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
     }
 
     // Reduce thread-local absmax across the block.
-    const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
+    const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols);
     if (threadIdx.x == 0) {
         // Save our block's absmax to shared memory for the quantization step.
         rowStats[row_id] = smem_row_absmax = row_absmax;
@@ -1847,7 +1855,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
 
     // Reduce thread-local absmax across the block.
     // TODO: Consider algorithm BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
-    const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
+    const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols);
     if (threadIdx.x == 0) {
         // Save our block's absmax to shared memory for the quantization step.
         rowStats[row_id] = row_absmax;
diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
index b5d9afc6b..28121240f 100644
--- a/csrc/pythonInterface.cpp
+++ b/csrc/pythonInterface.cpp
@@ -4,6 +4,7 @@
 // LICENSE file in the root directory of this source tree.
 
 #if BUILD_CUDA
+#include <cuda_runtime_api.h>
 #include <ops.cuh>
 #endif
 #if BUILD_HIP
@@ -710,7 +711,15 @@ void cprefetch(void* ptr, size_t bytes, int device) {
     if (hasPrefetch == 0)
         return;
 
+#if CUDART_VERSION >= 13000
+    cudaMemLocation loc{};
+    loc.type = cudaMemLocationTypeDevice;
+    loc.id = device;
+    CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, loc, 0u, 0));
+#else
     CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
+#endif
+
     CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }
 

From 5cee20b221c22c8d9dfcce0628a060caa1db56b2 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 22 Sep 2025 17:02:09 -0400
Subject: [PATCH 02/10] CI: disable cuda-toolkit local cache in Windows build

---
 .github/workflows/python-package.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 8b50074ce..bc6038e93 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -87,6 +87,7 @@ jobs:
           sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
           linux-local-args: '["--toolkit"]'
           use-github-cache: false
+          use-local-cache: false
       - name: Setup MSVC
         if: startsWith(matrix.os, 'windows')
         uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl

From 4274bb5a1a22f941550f0f117d3e8606fe9e076e Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 22 Sep 2025 17:10:37 -0400
Subject: [PATCH 03/10] CI: use "local" CUDA install method in Windows build

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index bc6038e93..a143ed317 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -83,7 +83,7 @@ jobs:
         with:
           # Temporary: Use CUDA 13.0.0 for Windows until 13.0.1 is supported with this action.
           cuda: ${{ matrix.cuda_version == '13.0.1' && '13.0.0' || matrix.cuda_version }}
-          method: "network"
+          method: "local"
           sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
           linux-local-args: '["--toolkit"]'
           use-github-cache: false

From 009ea8a47d7a0acad9cd004866241780bdc7db94 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 22 Sep 2025 17:20:06 -0400
Subject: [PATCH 04/10] CI: switch to N-Storm/cuda-toolkit fork; add CUDA 13.0.1 to test matrix

---
 .github/workflows/python-package.yml | 3 ++-
 .github/workflows/tests.yml          | 9 +++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index a143ed317..ba26d2167 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -77,7 +77,8 @@ jobs:
     steps:
       - uses: actions/checkout@v4
         # Windows: We install Cuda on the agent (slow)
-      - uses: Jimver/cuda-toolkit@433d453c1fa37d10a3254452fa8e284441c9192d # v0.2.27
+      #- uses: Jimver/cuda-toolkit@433d453c1fa37d10a3254452fa8e284441c9192d # v0.2.27
+      - uses: N-Storm/cuda-toolkit@d68ba29a800229200a2c3f572f9e816d7f67cdb4 # v0.2.24m
         if: startsWith(matrix.os, 'windows')
         id: cuda-toolkit
         with:
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 997da52bd..0266822ce 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -49,7 +49,7 @@ jobs:
   build-cuda:
     strategy:
       matrix:
-        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1", "13.0.1"]
         os: [ubuntu-22.04, ubuntu-22.04-arm]
         include:
           - os: ubuntu-22.04
@@ -343,7 +343,7 @@ jobs:
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
         gpu: [T4, L40S]
-        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1", "13.0.1"]
         include:
           - cuda_version: "11.8.0"
             torch_version: "2.3.1"
@@ -357,6 +357,9 @@ jobs:
           - cuda_version: "12.9.1"
             torch_version: "2.8.0"
             pypi_index: "https://download.pytorch.org/whl/cu129"
+          - cuda_version: "13.0.1"
+            torch_version: "2.9.0"
+            pypi_index: "https://download.pytorch.org/whl/test/cu130"
 
 
           # Linux L40S runners
@@ -395,6 +398,8 @@ jobs:
         exclude:
           # Our current T4 Windows runner has a driver too old (471.11)
           # and cannot support CUDA 12+. Skip for now.
+          - os: windows-2025
+            cuda_version: "13.0.1"
           - os: windows-2025
             cuda_version: "12.9.1"
           - os: windows-2025

From 82d82ef60e78fc281f04ddae036c826fe4063936 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 22 Sep 2025 17:29:06 -0400
Subject: [PATCH 05/10] CI: drop CUDA sub-packages list; add install log-file suffix

---
 .github/workflows/python-package.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index ba26d2167..8207aa072 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -85,10 +85,9 @@ jobs:
           # Temporary: Use CUDA 13.0.0 for Windows until 13.0.1 is supported with this action.
           cuda: ${{ matrix.cuda_version == '13.0.1' && '13.0.0' || matrix.cuda_version }}
           method: "local"
-          sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
-          linux-local-args: '["--toolkit"]'
           use-github-cache: false
           use-local-cache: false
+          log-file-suffix: ${{matrix.os}}-${{matrix.cuda_version}}.txt
       - name: Setup MSVC
         if: startsWith(matrix.os, 'windows')
         uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl

From 8dc83162da69ac43a48b4e9bb1d3eed639b18c92 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 23 Sep 2025 14:04:02 -0400
Subject: [PATCH 06/10] Add torch 2.9+cu130 to tests

---
 .github/workflows/tests.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 0266822ce..1662fb6df 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,4 +1,4 @@
-name: Unit tests
+name: Nightly Tests
 
 on:
   workflow_dispatch:
@@ -102,7 +102,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         # Test with the oldest supported torch version, the newest two stable/RC.
-        torch_version: ["2.3.1", "2.7.1", "2.8.0"]
+        torch_version: ["2.3.1", "2.8.0", "2.9.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
@@ -136,7 +136,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: 3.10
 
       - name: Setup MSVC
         if: startsWith(matrix.os, 'windows')
@@ -144,7 +144,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
+          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/${{ (matrix.torch_version == '2.9.0' && 'test/cpu') || 'cpu' }}
           pip install -e ".[test]"
           pip install pytest-cov
 
@@ -429,7 +429,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: 3.10
 
       - name: Install dependencies
         run: |

From 4ec01be71b843702c9f71b8ead019d9279c2fc13 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 23 Sep 2025 14:07:51 -0400
Subject: [PATCH 07/10] Quote python-version '3.10' so YAML does not parse it as 3.1

---
 .github/workflows/tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 1662fb6df..59a8657b0 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -136,7 +136,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.10
+          python-version: '3.10'
 
       - name: Setup MSVC
         if: startsWith(matrix.os, 'windows')
@@ -429,7 +429,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.10
+          python-version: '3.10'
 
       - name: Install dependencies
         run: |

From e720b32098c822c9558db2686a18e6331255b2c6 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 23 Sep 2025 14:42:07 -0400
Subject: [PATCH 08/10] Update test workflow

---
 .github/workflows/tests.yml | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 59a8657b0..375c80bd9 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -49,7 +49,8 @@ jobs:
   build-cuda:
     strategy:
       matrix:
-        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1", "13.0.1"]
+        # TODO: Add 13.0.1 when we have runners with new enough drivers.
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
         os: [ubuntu-22.04, ubuntu-22.04-arm]
         include:
           - os: ubuntu-22.04
@@ -111,7 +112,7 @@ jobs:
             arch: aarch64
           - os: ubuntu-22.04-arm
             arch: aarch64
-            torch_version: "2.5.1"
+            torch_version: "2.5.1"    # Higher minimum requirement for aarch64
           - os: windows-2025
             arch: x86_64
           - os: macos-15
@@ -182,7 +183,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
 
       - name: Install dependencies
         run: |
@@ -313,7 +314,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
 
       - name: Install PyTorch
         run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu
@@ -343,7 +344,7 @@ jobs:
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
         gpu: [T4, L40S]
-        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1", "13.0.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"] #, "13.0.1"]
         include:
           - cuda_version: "11.8.0"
             torch_version: "2.3.1"
@@ -351,15 +352,18 @@ jobs:
           - cuda_version: "12.6.3"
             torch_version: "2.6.0"
             pypi_index: "https://download.pytorch.org/whl/cu126"
-          - cuda_version: "12.8.1"
-            torch_version: "2.7.1"
-            pypi_index: "https://download.pytorch.org/whl/cu128"
           - cuda_version: "12.9.1"
             torch_version: "2.8.0"
             pypi_index: "https://download.pytorch.org/whl/cu129"
-          - cuda_version: "13.0.1"
+          - cuda_version: "12.8.1"
             torch_version: "2.9.0"
-            pypi_index: "https://download.pytorch.org/whl/test/cu130"
+            pypi_index: "https://download.pytorch.org/whl/test/cu128"
+
+          # Note: Currently our runners do not have new enough drivers for CUDA 13.
+          # Add this when supported.
+          # - cuda_version: "13.0.1"
+          #   torch_version: "2.9.0"
+          #   pypi_index: "https://download.pytorch.org/whl/test/cu130"
 
 
           # Linux L40S runners

From 6afa935934c32099d22821fbe883ea131bff3f1d Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 23 Sep 2025 15:11:10 -0400
Subject: [PATCH 09/10] Don't test CPU on torch 2.9 yet

---
 .github/workflows/tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 375c80bd9..bfceeab11 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -103,7 +103,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         # Test with the oldest supported torch version, the newest two stable/RC.
-        torch_version: ["2.3.1", "2.8.0", "2.9.0"]
+        torch_version: ["2.3.1", "2.7.1", "2.8.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
@@ -145,7 +145,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/${{ (matrix.torch_version == '2.9.0' && 'test/cpu') || 'cpu' }}
+          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
           pip install -e ".[test]"
           pip install pytest-cov
 

From 9e4f73ec68c1e89bf9088726faf8f4c61ecbb8b4 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 23 Sep 2025 15:22:43 -0400
Subject: [PATCH 10/10] Document CUDA 13.0 build targets in installation guide

---
 docs/source/installation.mdx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index daa06a3c6..def8308bd 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -40,10 +40,12 @@ The currently distributed `bitsandbytes` packages are built with the following c
 |--------------------|------------------|----------------------|--------------
 | **Linux x86-64**   | 11.8 - 12.6      | GCC 11.2             | sm60, sm70, sm75, sm80, sm86, sm89, sm90
 | **Linux x86-64**   | 12.8 - 12.9      | GCC 11.2             | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120
+| **Linux x86-64**   | 13.0             | GCC 11.2             | sm75, sm80, sm86, sm89, sm90, sm100, sm120
 | **Linux aarch64**  | 11.8 - 12.6      | GCC 11.2             | sm75, sm80, sm90
-| **Linux aarch64**  | 12.8 - 12.9      | GCC 11.2             | sm75, sm80, sm90, sm100, sm120
+| **Linux aarch64**  | 12.8 - 13.0      | GCC 11.2             | sm75, sm80, sm90, sm100, sm120
 | **Windows x86-64** | 11.8 - 12.6      | MSVC 19.43+ (VS2022) | sm50, sm60, sm75, sm80, sm86, sm89, sm90
 | **Windows x86-64** | 12.8 - 12.9      | MSVC 19.43+ (VS2022) | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120
+| **Windows x86-64** | 13.0             | MSVC 19.43+ (VS2022) | sm75, sm80, sm86, sm89, sm90, sm100, sm120
 
 Use `pip` or `uv` to install: