From 40e6971a69c2e7524a5ddc53ea25e6d0aee946e5 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:00:03 -0400 Subject: [PATCH 01/10] CUDA 13 build enablement --- .github/scripts/build-cuda.sh | 5 +++- .github/workflows/python-package.yml | 8 ++--- CMakeLists.txt | 44 ++++++++++++++++------------ csrc/kernels.cu | 32 ++++++++++++-------- csrc/pythonInterface.cpp | 9 ++++++ 5 files changed, 62 insertions(+), 36 deletions(-) diff --git a/.github/scripts/build-cuda.sh b/.github/scripts/build-cuda.sh index b13d9c92b..9eed06896 100644 --- a/.github/scripts/build-cuda.sh +++ b/.github/scripts/build-cuda.sh @@ -12,13 +12,16 @@ elif [ "${build_arch}" = "aarch64" ]; then build_capability="75;80;90" # CUDA 12.8+: Add sm100/sm120 - [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120" + [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* || "${cuda_version}" == 13.*.* ]] && build_capability="75;80;90;100;120" else # By default, target Pascal through Hopper. build_capability="60;70;75;80;86;89;90" # CUDA 12.8+: Add sm100 and sm120; remove < sm70 to align with PyTorch 2.8+cu128 minimum [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="70;75;80;86;89;90;100;120" + + # CUDA 13.0+: Remove < sm75 to align with PyTorch 2.9+cu130 minimum + [[ "${cuda_version}" == 13.*.* ]] && build_capability="75;80;86;89;90;100;120" fi [[ "${build_os}" = windows-* ]] && python3 -m pip install ninja diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index a11b13f33..8b50074ce 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -72,17 +72,17 @@ jobs: - os: windows-latest arch: x86_64 cuda_version: - ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1"] + ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1", "13.0.1"] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 # Windows: We install Cuda on the agent (slow) - - uses: Jimver/cuda-toolkit@c35baa1a18fd1fc9dcf47c5bd839bf30559c0bc3 # v0.2.24 + - uses: Jimver/cuda-toolkit@433d453c1fa37d10a3254452fa8e284441c9192d # v0.2.27 if: startsWith(matrix.os, 'windows') id: cuda-toolkit with: - # Temporary: Use CUDA 12.9.0 for Windows until 12.9.1 is supported with this action. - cuda: ${{ matrix.cuda_version == '12.9.1' && '12.9.0' || matrix.cuda_version }} + # Temporary: Use CUDA 13.0.0 for Windows until 13.0.1 is supported with this action. + cuda: ${{ matrix.cuda_version == '13.0.1' && '13.0.0' || matrix.cuda_version }} method: "network" sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]' linux-local-args: '["--toolkit"]' diff --git a/CMakeLists.txt b/CMakeLists.txt index 429570443..7c23a2799 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,30 +113,36 @@ if(BUILD_CUDA) ) endif() - if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.4") - message(FATAL_ERROR "CUDA Version < 11.4 is not supported") - elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") - message(FATAL_ERROR "CUDA Version > 12 is not supported") + if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.8") + message(FATAL_ERROR "CUDA Version < 11.8 is not supported") + elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "14.0") + message(FATAL_ERROR "CUDA Version > 13 is not supported") endif() # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL. if(CMAKE_VERSION VERSION_LESS "3.23.0") message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...") - # 11.4+ supports these at a minimum. - set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87) - set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80) - - # CUDA 11.8 adds support for Ada and Hopper. - if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8") - list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90) - list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90) - endif() - - # CUDA 12.8 adds support for Blackwell. - if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8") - list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120) - list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120) + if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") + # Starting in CUDA 13.0, Thor Blackwell is renamed to SM110. + # Support for architectures older than Turing (SM75) is removed. + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 75 80 86 87 88 89 90 100 103 110 120 121) + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 80 90 100 110 120) + else() + # 11.8-12.9 supports these at a minimum. + set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87 89 90) + set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80 90) + + # CUDA 12.8 adds support for Blackwell. + if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8") + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120 121) + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120) + endif() + + # CUDA 12.9 adds SM103 (Blackwell B300). + if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9") + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 103) + endif() endif() endif() @@ -252,7 +258,7 @@ endif() set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX) add_library(bitsandbytes SHARED ${SRC_FILES}) -target_compile_features(bitsandbytes PUBLIC cxx_std_14) +target_compile_features(bitsandbytes PUBLIC cxx_std_17) target_include_directories(bitsandbytes PUBLIC csrc include) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 738ae0cd1..2c232da80 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -16,6 +16,14 @@ #include #include +#if CCCL_VERSION >= 2008002 +#include +#define CUB_REDUCTIONOP_MAX \ + cuda::maximum<> {} +#else +#define CUB_REDUCTIONOP_MAX cub::Max() +#endif + #define HLF_MAX 65504 #define TH 1024 #define NUM 4 @@ -365,7 +373,7 @@ __global__ void kQuantizeBlockwise( for (int j = 0; j < NUM_PER_TH; j++) local_abs_max = fmaxf(local_abs_max, fabsf((float)vals[j])); - local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, cub::Max(), valid_items); + local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, CUB_REDUCTIONOP_MAX, valid_items); if (threadIdx.x == 0) { smem_absmax_value[0] = 1.0f / local_abs_max; @@ -951,12 +959,12 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b } __syncthreads(); - local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items); + local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, CUB_REDUCTIONOP_MAX, valid_items); __syncthreads(); - local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, cub::Max(), valid_items); + local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, CUB_REDUCTIONOP_MAX, valid_items); if (unorm != NULL) { __syncthreads(); - local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items); + local_unorm = BlockReduce(temp_storage.reduce).Sum(local_unorm, valid_items); } if (threadIdx.x == 0) { @@ -1162,13 +1170,13 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b } __syncthreads(); - local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items); + local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, CUB_REDUCTIONOP_MAX, valid_items); if (threadIdx.x == 0) { atomicMax(&new_max1[0], local_max_s1); } if (unorm != NULL) { __syncthreads(); - local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items); + local_unorm = BlockReduce(temp_storage.reduce).Sum(local_unorm, valid_items); if (threadIdx.x == 0) { atomicAdd(&unorm[0], local_unorm); } @@ -1473,11 +1481,11 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit2StateBlockwise( } // reduce: 2.51/1.60 -> 2.67/1.69 - new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max()); - new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, cub::Max()); + new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, CUB_REDUCTIONOP_MAX); + new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, CUB_REDUCTIONOP_MAX); if (OPTIMIZER == ADEMAMIX) { - new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, cub::Max()); + new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, CUB_REDUCTIONOP_MAX); } if (threadIdx.x == 0) { @@ -1686,7 +1694,7 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit1StateBlockwise( } // reduce: 2.51/1.60 -> 2.67/1.69 - new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max()); + new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, CUB_REDUCTIONOP_MAX); if (threadIdx.x == 0) smem_exchange1[0] = new_local_abs_max1; @@ -1792,7 +1800,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__ } // Reduce thread-local absmax across the block. - const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols); + const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols); if (threadIdx.x == 0) { // Save our block's absmax to shared memory for the quantization step. rowStats[row_id] = smem_row_absmax = row_absmax; @@ -1847,7 +1855,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__ // Reduce thread-local absmax across the block. // TODO: Consider algorithm BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY - const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols); + const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols); if (threadIdx.x == 0) { // Save our block's absmax to shared memory for the quantization step. rowStats[row_id] = row_absmax; diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp index b5d9afc6b..28121240f 100644 --- a/csrc/pythonInterface.cpp +++ b/csrc/pythonInterface.cpp @@ -4,6 +4,7 @@ // LICENSE file in the root directory of this source tree. #if BUILD_CUDA +#include #include #endif #if BUILD_HIP @@ -710,7 +711,15 @@ void cprefetch(void* ptr, size_t bytes, int device) { if (hasPrefetch == 0) return; +#if CUDART_VERSION >= 13000 + cudaMemLocation loc{}; + loc.type = cudaMemLocationTypeDevice; + loc.id = device; + CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, loc, 0u, 0)); +#else CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0)); +#endif + CUDA_CHECK_RETURN(cudaPeekAtLastError()); } From 5cee20b221c22c8d9dfcce0628a060caa1db56b2 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:02:09 -0400 Subject: [PATCH 02/10] Try to fix Windows build workflow --- .github/workflows/python-package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 8b50074ce..bc6038e93 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -87,6 +87,7 @@ jobs: sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]' linux-local-args: '["--toolkit"]' use-github-cache: false + use-local-cache: false - name: Setup MSVC if: startsWith(matrix.os, 'windows') uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl From 4274bb5a1a22f941550f0f117d3e8606fe9e076e Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:10:37 -0400 Subject: [PATCH 03/10] Try to fix Windows build workflow --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index bc6038e93..a143ed317 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -83,7 +83,7 @@ jobs: with: # Temporary: Use CUDA 13.0.0 for Windows until 13.0.1 is supported with this action. cuda: ${{ matrix.cuda_version == '13.0.1' && '13.0.0' || matrix.cuda_version }} - method: "network" + method: "local" sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]' linux-local-args: '["--toolkit"]' use-github-cache: false From 009ea8a47d7a0acad9cd004866241780bdc7db94 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:20:06 -0400 Subject: [PATCH 04/10] Try to fix Windows build workflow --- .github/workflows/python-package.yml | 3 ++- .github/workflows/tests.yml | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index a143ed317..ba26d2167 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -77,7 +77,8 @@ jobs: steps: - uses: actions/checkout@v4 # Windows: We install Cuda on the agent (slow) - - uses: Jimver/cuda-toolkit@433d453c1fa37d10a3254452fa8e284441c9192d # v0.2.27 + #- uses: Jimver/cuda-toolkit@433d453c1fa37d10a3254452fa8e284441c9192d # v0.2.27 + - uses: N-Storm/cuda-toolkit@d68ba29a800229200a2c3f572f9e816d7f67cdb4 # v0.2.24m if: startsWith(matrix.os, 'windows') id: cuda-toolkit with: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 997da52bd..0266822ce 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -49,7 +49,7 @@ jobs: build-cuda: strategy: matrix: - cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"] + cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1", "13.0.1"] os: [ubuntu-22.04, ubuntu-22.04-arm] include: - os: ubuntu-22.04 @@ -343,7 +343,7 @@ jobs: os: [ubuntu-22.04, windows-2025] arch: [x86_64] gpu: [T4, L40S] - cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"] + cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1", "13.0.1"] include: - cuda_version: "11.8.0" torch_version: "2.3.1" @@ -357,6 +357,9 @@ jobs: - cuda_version: "12.9.1" torch_version: "2.8.0" pypi_index: "https://download.pytorch.org/whl/cu129" + - cuda_version: "13.0.1" + torch_version: "2.9.0" + pypi_index: "https://download.pytorch.org/whl/test/cu130" # Linux L40S runners @@ -395,6 +398,8 @@ jobs: exclude: # Our current T4 Windows runner has a driver too old (471.11) # and cannot support CUDA 12+. Skip for now. + - os: windows-2025 + cuda_version: "13.0.1" - os: windows-2025 cuda_version: "12.9.1" - os: windows-2025 From 82d82ef60e78fc281f04ddae036c826fe4063936 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:29:06 -0400 Subject: [PATCH 05/10] Try to fix Windows build workflow --- .github/workflows/python-package.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index ba26d2167..8207aa072 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -85,10 +85,9 @@ jobs: # Temporary: Use CUDA 13.0.0 for Windows until 13.0.1 is supported with this action. cuda: ${{ matrix.cuda_version == '13.0.1' && '13.0.0' || matrix.cuda_version }} method: "local" - sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]' - linux-local-args: '["--toolkit"]' use-github-cache: false use-local-cache: false + log-file-suffix: ${{matrix.os}}-${{matrix.cuda_version}}.txt - name: Setup MSVC if: startsWith(matrix.os, 'windows') uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl From 8dc83162da69ac43a48b4e9bb1d3eed639b18c92 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 23 Sep 2025 14:04:02 -0400 Subject: [PATCH 06/10] Add torch 2.9+cu130 to tests --- .github/workflows/tests.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0266822ce..1662fb6df 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: Unit tests +name: Nightly Tests on: workflow_dispatch: @@ -102,7 +102,7 @@ jobs: matrix: os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15] # Test with the oldest supported torch version, the newest two stable/RC. - torch_version: ["2.3.1", "2.7.1", "2.8.0"] + torch_version: ["2.3.1", "2.8.0", "2.9.0"] include: - os: ubuntu-22.04 arch: x86_64 @@ -136,7 +136,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: 3.10 - name: Setup MSVC if: startsWith(matrix.os, 'windows') @@ -144,7 +144,7 @@ jobs: - name: Install dependencies run: | - pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu + pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/${{ (matrix.torch_version == '2.9.0' && 'test/cpu') || 'cpu' }} pip install -e ".[test]" pip install pytest-cov @@ -429,7 +429,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: 3.10 - name: Install dependencies run: | From 4ec01be71b843702c9f71b8ead019d9279c2fc13 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 23 Sep 2025 14:07:51 -0400 Subject: [PATCH 07/10] Fix python version --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1662fb6df..59a8657b0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -136,7 +136,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.10 + python-version: '3.10' - name: Setup MSVC if: startsWith(matrix.os, 'windows') @@ -429,7 +429,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.10 + python-version: '3.10' - name: Install dependencies run: | From e720b32098c822c9558db2686a18e6331255b2c6 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 23 Sep 2025 14:42:07 -0400 Subject: [PATCH 08/10] Update test workflow --- .github/workflows/tests.yml | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 59a8657b0..375c80bd9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -49,7 +49,8 @@ jobs: build-cuda: strategy: matrix: - cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1", "13.0.1"] + # TODO: Add 13.0.1 when we have runners with new enough drivers. + cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"] os: [ubuntu-22.04, ubuntu-22.04-arm] include: - os: ubuntu-22.04 @@ -111,7 +112,7 @@ jobs: arch: aarch64 - os: ubuntu-22.04-arm arch: aarch64 - torch_version: "2.5.1" + torch_version: "2.5.1" # Higher minimum requirement for aarch64 - os: windows-2025 arch: x86_64 - os: macos-15 @@ -182,7 +183,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: '3.10' - name: Install dependencies run: | @@ -313,7 +314,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: '3.10' - name: Install PyTorch run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu @@ -343,7 +344,7 @@ jobs: os: [ubuntu-22.04, windows-2025] arch: [x86_64] gpu: [T4, L40S] - cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1", "13.0.1"] + cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"] #, "13.0.1"] include: - cuda_version: "11.8.0" torch_version: "2.3.1" @@ -351,15 +352,18 @@ jobs: - cuda_version: "12.6.3" torch_version: "2.6.0" pypi_index: "https://download.pytorch.org/whl/cu126" - - cuda_version: "12.8.1" - torch_version: "2.7.1" - pypi_index: "https://download.pytorch.org/whl/cu128" - cuda_version: "12.9.1" torch_version: "2.8.0" pypi_index: "https://download.pytorch.org/whl/cu129" - - cuda_version: "13.0.1" + - cuda_version: "12.8.1" torch_version: "2.9.0" - pypi_index: "https://download.pytorch.org/whl/test/cu130" + pypi_index: "https://download.pytorch.org/whl/test/cu128" + + # Note: Currently our runners do not have new enough drivers for CUDA 13. + # Add this when supported. + # - cuda_version: "13.0.1" + # torch_version: "2.9.0" + # pypi_index: "https://download.pytorch.org/whl/test/cu130" # Linux L40S runners From 6afa935934c32099d22821fbe883ea131bff3f1d Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:11:10 -0400 Subject: [PATCH 09/10] Don't test CPU on torch 2.9 yet --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 375c80bd9..bfceeab11 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -103,7 +103,7 @@ jobs: matrix: os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15] # Test with the oldest supported torch version, the newest two stable/RC. - torch_version: ["2.3.1", "2.8.0", "2.9.0"] + torch_version: ["2.3.1", "2.7.1", "2.8.0"] include: - os: ubuntu-22.04 arch: x86_64 @@ -145,7 +145,7 @@ jobs: - name: Install dependencies run: | - pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/${{ (matrix.torch_version == '2.9.0' && 'test/cpu') || 'cpu' }} + pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu pip install -e ".[test]" pip install pytest-cov From 9e4f73ec68c1e89bf9088726faf8f4c61ecbb8b4 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:22:43 -0400 Subject: [PATCH 10/10] Update doc --- docs/source/installation.mdx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index daa06a3c6..def8308bd 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -40,10 +40,12 @@ The currently distributed `bitsandbytes` packages are built with the following c |--------------------|------------------|----------------------|-------------- | **Linux x86-64** | 11.8 - 12.6 | GCC 11.2 | sm60, sm70, sm75, sm80, sm86, sm89, sm90 | **Linux x86-64** | 12.8 - 12.9 | GCC 11.2 | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120 +| **Linux x86-64** | 13.0 | GCC 11.2 | sm75, sm80, sm86, sm89, sm90, sm100, sm120 | **Linux aarch64** | 11.8 - 12.6 | GCC 11.2 | sm75, sm80, sm90 -| **Linux aarch64** | 12.8 - 12.9 | GCC 11.2 | sm75, sm80, sm90, sm100, sm120 +| **Linux aarch64** | 12.8 - 13.0 | GCC 11.2 | sm75, sm80, sm90, sm100, sm120 | **Windows x86-64** | 11.8 - 12.6 | MSVC 19.43+ (VS2022) | sm50, sm60, sm75, sm80, sm86, sm89, sm90 | **Windows x86-64** | 12.8 - 12.9 | MSVC 19.43+ (VS2022) | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120 +| **Windows x86-64** | 13.0 | MSVC 19.43+ (VS2022) | sm75, sm80, sm86, sm89, sm90, sm100, sm120 Use `pip` or `uv` to install: