From 0fe5c1defeca49dcbd394c3078df187d1f735418 Mon Sep 17 00:00:00 2001
From: Johnny
Date: Sun, 24 Aug 2025 00:28:54 +0200
Subject: [PATCH 01/18] Update CMakeLists.txt

---
 CMakeLists.txt | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 770b4ba30..608e8e53e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,8 +106,6 @@ if(BUILD_CUDA)
 
     if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.4")
         message(FATAL_ERROR "CUDA Version < 11.4 is not supported")
-    elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
-        message(FATAL_ERROR "CUDA Version > 12 is not supported")
     endif()
 
     # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL.
@@ -126,8 +124,12 @@ if(BUILD_CUDA)
 
         # CUDA 12.8 adds support for Blackwell.
         if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 120)
             list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
+        endif()
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 103 110 121)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 103 110 121)
         endif()
     endif()
 

From 6f93e1918b5a24c244b56b2bc314073c343f3b5a Mon Sep 17 00:00:00 2001
From: Johnny
Date: Sun, 24 Aug 2025 00:33:33 +0200
Subject: [PATCH 02/18] Update CMakeLists.txt

---
 CMakeLists.txt | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 608e8e53e..9315d5ea4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,8 +113,13 @@ if(BUILD_CUDA)
         message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
 
         # 11.4+ supports these at a minimum.
-        set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
-        set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
+            set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
+            set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES_ALL 80 86 87)
+            set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 80)
+        endif()
 
         # CUDA 11.8 adds support for Ada and Hopper.
         if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")

From 9e310220a868f93f311f12a62ca2636c776f7ad1 Mon Sep 17 00:00:00 2001
From: Johnny
Date: Sun, 24 Aug 2025 00:36:30 +0200
Subject: [PATCH 03/18] Update CMakeLists.txt

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9315d5ea4..75c7b3826 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,7 +113,7 @@ if(BUILD_CUDA)
         message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
 
         # 11.4+ supports these at a minimum.
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "13.0")
             set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
             set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
         else()

From 44caa8f2341a5c626c1242fd7387d252f59bde82 Mon Sep 17 00:00:00 2001
From: Johnny
Date: Sun, 24 Aug 2025 00:42:36 +0200
Subject: [PATCH 04/18] Update CMakeLists.txt

---
 CMakeLists.txt | 75 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 75c7b3826..c10ac6443 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,35 +108,52 @@ if(BUILD_CUDA)
         message(FATAL_ERROR "CUDA Version < 11.4 is not supported")
     endif()
 
-    # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL.
-    if(CMAKE_VERSION VERSION_LESS "3.23.0")
-        message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
-
-        # 11.4+ supports these at a minimum.
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "13.0")
-            set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
-            set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
-        else()
-            set(CMAKE_CUDA_ARCHITECTURES_ALL 80 86 87)
-            set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 80)
-        endif()
-
-        # CUDA 11.8 adds support for Ada and Hopper.
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
-        endif()
-
-        # CUDA 12.8 adds support for Blackwell.
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 120)
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
-        endif()
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 103 110 121)
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 103 110 121)
-        endif()
-    endif()
+    # For CMake < 3.23.0: manually determine supported CUDA architectures
+    if (CMAKE_VERSION VERSION_LESS "3.23.0")
+        message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
+
+        # Base (CUDA 11.4+)
+        set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
+        set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
+
+        # CUDA 11.8 adds Ada/Hopper
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
+        endif()
+
+        # CUDA 12.8 adds Blackwell
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 120)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
+        endif()
+
+        # CUDA 13: drop all architectures <= 80-series
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
+            # Add new SMs introduced in CUDA 13
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 103 110 121)
+
+            # Remove all SMs below 86 (including 80-series)
+            list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES_ALL
+                 50 52 53 60 61 62 70 72 75 80)
+
+            # Remove majors below 90
+            list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
+
+            # Optionally, keep majors as round numbers only in CUDA 13+
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90 100 120)
+        endif()
+    endif()
+
+    # Cleanup: remove duplicates and sort naturally
+    list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL)
+    list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR)
+    list(SORT CMAKE_CUDA_ARCHITECTURES_ALL COMPARE NATURAL)
+    list(SORT CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR COMPARE NATURAL)
+
+    # (Optional) Use these SMs directly for compilation
+    # set(CMAKE_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_ALL}" CACHE STRING "" FORCE)
+endif()
 
     string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math")
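
A note on the list handling introduced in PATCH 04: the APPEND/REMOVE_ITEM edits followed by REMOVE_DUPLICATES and a NATURAL sort can be tried in isolation with cmake -P. A minimal sketch; the file name check_archs.cmake and the hard-coded starting list are illustrative only, not part of the series (list(SORT ... COMPARE NATURAL) needs CMake 3.18+):

    # check_archs.cmake (hypothetical helper); run with: cmake -P check_archs.cmake
    # Start from the list a CUDA 12.8 toolchain would produce above.
    set(ARCHS_ALL 50 52 53 60 61 62 70 72 75 80 86 87 89 90 100 120)

    # Mirror the CUDA 13 branch of the patch: add new SMs, drop the old ones.
    list(APPEND ARCHS_ALL 103 110 121)
    list(REMOVE_ITEM ARCHS_ALL 50 52 53 60 61 62 70 72 75 80)

    # Cleanup, as in the patch.
    list(REMOVE_DUPLICATES ARCHS_ALL)
    list(SORT ARCHS_ALL COMPARE NATURAL)

    message(STATUS "Resulting architectures: ${ARCHS_ALL}")
    # Prints: -- Resulting architectures: 86;87;89;90;100;103;110;120;121
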
From b5517e1e0e7f061c8bd792ac76d5ddf021d39f41 Mon Sep 17 00:00:00 2001
From: Johnny
Date: Sun, 24 Aug 2025 00:46:13 +0200
Subject: [PATCH 05/18] Update CMakeLists.txt

---
 CMakeLists.txt | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c10ac6443..3258c4ab4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -143,17 +143,16 @@ if(BUILD_CUDA)
             # Optionally, keep majors as round numbers only in CUDA 13+
             list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90 100 120)
         endif()
-    endif()
 
-    # Cleanup: remove duplicates and sort naturally
-    list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL)
-    list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR)
-    list(SORT CMAKE_CUDA_ARCHITECTURES_ALL COMPARE NATURAL)
-    list(SORT CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR COMPARE NATURAL)
-
-    # (Optional) Use these SMs directly for compilation
-    # set(CMAKE_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_ALL}" CACHE STRING "" FORCE)
-endif()
+        # Cleanup: remove duplicates and sort naturally
+        list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL)
+        list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR)
+        list(SORT CMAKE_CUDA_ARCHITECTURES_ALL COMPARE NATURAL)
+        list(SORT CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR COMPARE NATURAL)
+
+        # (Optional) Use these SMs directly for compilation
+        # set(CMAKE_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_ALL}" CACHE STRING "" FORCE)
+    endif()
 
     string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math")

From 7a732d840a932fce9924f549d2388417db3c2113 Mon Sep 17 00:00:00 2001
From: Johnny
Date: Sun, 24 Aug 2025 00:48:08 +0200
Subject: [PATCH 06/18] Update CMakeLists.txt

---
 CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3258c4ab4..7f401ae83 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -133,12 +133,12 @@ if(BUILD_CUDA)
             # Add new SMs introduced in CUDA 13
             list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 103 110 121)
 
-            # Remove all SMs below 86 (including 80-series)
+            # Remove all SMs below 80
             list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES_ALL
-                 50 52 53 60 61 62 70 72 75 80)
+                 50 52 53 60 61 62 70 72 75)
 
             # Remove majors below 90
-            list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
+            list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70)
 
             # Optionally, keep majors as round numbers only in CUDA 13+
             list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90 100 120)

From b3221c6df5c106fd5f51c96ecd8f02c042b5f75c Mon Sep 17 00:00:00 2001
From: Johnny
Date: Sun, 24 Aug 2025 00:53:42 +0200
Subject: [PATCH 07/18] Update pythonInterface.cpp

---
 csrc/pythonInterface.cpp | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
index 9c4cab9cc..5e4dcb6a8 100644
--- a/csrc/pythonInterface.cpp
+++ b/csrc/pythonInterface.cpp
@@ -614,16 +614,40 @@ void* cget_managed_ptr(size_t bytes) {
     return ptr;
 }
 
-void cprefetch(void* ptr, size_t bytes, int device) {
+#include
+#ifndef CUDART_VERSION
+#define CUDART_VERSION 0
+#endif
+
+// Unified helper: CUDA13+ uses cudaMemLocation; older CUDA/HIP keeps int device
+static inline cudaError_t bnb_prefetch_to(void* ptr, size_t bytes, int device, cudaStream_t stream) {
+#if defined(BUILD_CUDA) && !defined(BUILD_HIP) && (CUDART_VERSION >= 13000)
+    cudaMemLocation loc{};
+    if (device == cudaCpuDeviceId) {
+        loc.type = cudaMemLocationTypeHost;
+        loc.id = 0;
+    } else {
+        loc.type = cudaMemLocationTypeDevice;
+        loc.id = device;
+    }
+    return cudaMemPrefetchAsync(ptr, bytes, loc, stream);
+#else
+    // Older CUDA or HIP path (your BUILD_HIP macro maps cudaMemPrefetchAsync -> hipMemPrefetchAsync)
+    return cudaMemPrefetchAsync(ptr, bytes, device, stream);
+#endif
+}
 
-    int hasPrefetch = 0;
-    CUDA_CHECK_RETURN(
-        cudaDeviceGetAttribute(&hasPrefetch, cudaDevAttrConcurrentManagedAccess, device)
-    ); // 40ns overhead
-    if (hasPrefetch == 0)
-        return;
+void cprefetch(void* ptr, size_t bytes, int device) {
+    // Only check the device attribute when prefetching to a device
+    if (device != cudaCpuDeviceId) {
+        int hasPrefetch = 0;
+        CUDA_CHECK_RETURN(cudaDeviceGetAttribute(
+            &hasPrefetch, cudaDevAttrConcurrentManagedAccess, device)); // ~40ns
+        if (hasPrefetch == 0)
+            return;
+    }
 
-    CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
+    CUDA_CHECK_RETURN(bnb_prefetch_to(ptr, bytes, device, /*stream=*/0));
     CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }
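
A note on the runtime API behind PATCH 07: before CUDA 13, cudaMemPrefetchAsync takes a bare device ordinal, while the CUDA 13 runtime expects a cudaMemLocation plus a flags argument (the form PATCH 15 later uses). A small standalone sketch of both calls on a managed buffer; the file name and sizes are made up, and error checking is omitted:

    // prefetch_demo.cu (illustrative only, not bitsandbytes code); build with nvcc.
    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
        const size_t bytes = 1 << 20;
        float* buf = nullptr;
        cudaMallocManaged(&buf, bytes);

        int device = 0;
        cudaGetDevice(&device);

    #if CUDART_VERSION >= 13000
        // CUDA 13+: describe the target as a cudaMemLocation and pass flags.
        cudaMemLocation loc{};
        loc.type = cudaMemLocationTypeDevice;
        loc.id = device;
        cudaMemPrefetchAsync(buf, bytes, loc, /*flags=*/0, /*stream=*/0);
    #else
        // CUDA 11.x / 12.x: plain device ordinal.
        cudaMemPrefetchAsync(buf, bytes, device, /*stream=*/0);
    #endif

        cudaDeviceSynchronize();
        printf("prefetched %zu bytes to device %d\n", bytes, device);
        cudaFree(buf);
        return 0;
    }
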
From 3e9dedbd0d0634c57155bccfbca6eca651e5856b Mon Sep 17 00:00:00 2001
From: Johnny
Date: Sun, 24 Aug 2025 00:54:12 +0200
Subject: [PATCH 08/18] Update CMakeLists.txt

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f401ae83..88d51921a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,8 +113,8 @@ if(BUILD_CUDA)
         message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
 
         # Base (CUDA 11.4+)
-        set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
-        set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
+        set(CMAKE_CUDA_ARCHITECTURES_ALL 80 86 87)
+        set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 80)
 
         # CUDA 11.8 adds Ada/Hopper
         if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")

From 12a0d4377171fc69a8371dbd21609d08d1c91f71 Mon Sep 17 00:00:00 2001
From: Johnny
Date: Sun, 24 Aug 2025 00:59:35 +0200
Subject: [PATCH 09/18] Update CMakeLists.txt

---
 CMakeLists.txt | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 88d51921a..674e8990f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -168,6 +168,19 @@ if(BUILD_CUDA)
             list(APPEND POSSIBLE_CAPABILITIES ${capability_id})
         endif()
     endforeach()
+
+    # CUDA 13+: drop <=80 and add new SMs
+    if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
+        list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 103 110 121)
+        list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75)
+        list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70)
+        list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 80 90 100 120)
+    endif()
+
+    list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL)
+    list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR)
+    list(SORT CMAKE_CUDA_ARCHITECTURES_ALL COMPARE NATURAL)
+    list(SORT CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR COMPARE NATURAL)
 
     # This can be changed via -D argument to CMake
     # By default all possible capabilities are compiled

From 02188cc8159b2fd88fc6765cb29b3a9deb65591a Mon Sep 17 00:00:00 2001
From: Johnny
Date: Sun, 24 Aug 2025 01:01:24 +0200
Subject: [PATCH 10/18] Update CMakeLists.txt

---
 CMakeLists.txt | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 674e8990f..a0812e793 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -109,7 +109,7 @@ if(BUILD_CUDA)
     endif()
 
     # For CMake < 3.23.0: manually determine supported CUDA architectures
-    if (CMAKE_VERSION VERSION_LESS "3.23.0")
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.23.0")
         message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
 
         # Base (CUDA 11.4+)
@@ -161,27 +161,6 @@ if(BUILD_CUDA)
         string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v")
     endif()
 
-    foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL})
-        # Most of the items here are like: `xx-real`, so we just extract the `xx` portion
-        string(REGEX MATCH "[0-9]+" capability_id "${capability}")
-        if(capability_id GREATER 0)
-            list(APPEND POSSIBLE_CAPABILITIES ${capability_id})
-        endif()
-    endforeach()
-
-    # CUDA 13+: drop <=80 and add new SMs
-    if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
-        list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 103 110 121)
-        list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75)
-        list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70)
-        list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 80 90 100 120)
-    endif()
-
-    list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL)
-    list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR)
-    list(SORT CMAKE_CUDA_ARCHITECTURES_ALL COMPARE NATURAL)
-    list(SORT CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR COMPARE NATURAL)
-
     # This can be changed via -D argument to CMake
     # By default all possible capabilities are compiled
     set(COMPUTE_CAPABILITY "${POSSIBLE_CAPABILITIES}" CACHE STRING "Compute Capabilities Targeted")

From c6f7d276561a7888e49589450d268f1cd5b06b42 Mon Sep 17 00:00:00 2001
From: Johnny
Date: Sun, 24 Aug 2025 01:03:04 +0200
Subject: [PATCH 11/18] Update CMakeLists.txt

---
 CMakeLists.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0812e793..d10d40eb3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,6 +161,14 @@ if(BUILD_CUDA)
         string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v")
     endif()
 
+    foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL})
+        # Most of the items here are like: `xx-real`, so we just extract the `xx` portion
+        string(REGEX MATCH "[0-9]+" capability_id "${capability}")
+        if(capability_id GREATER 0)
+            list(APPEND POSSIBLE_CAPABILITIES ${capability_id})
+        endif()
+    endforeach()
+
     # This can be changed via -D argument to CMake
     # By default all possible capabilities are compiled
     set(COMPUTE_CAPABILITY "${POSSIBLE_CAPABILITIES}" CACHE STRING "Compute Capabilities Targeted")
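
A note on the loop restored in PATCH 11: it only needs the leading digits of entries such as 90-real or 90-virtual. The same extraction can be checked on its own with cmake -P; the file name and input list below are invented for illustration:

    # extract_caps.cmake (hypothetical); run with: cmake -P extract_caps.cmake
    set(ARCHS 80-real 86-real 89-virtual 90 100-real 120)
    set(POSSIBLE_CAPABILITIES "")
    foreach(capability ${ARCHS})
        # Keep only the numeric prefix, e.g. "89-virtual" -> "89".
        string(REGEX MATCH "[0-9]+" capability_id "${capability}")
        if(capability_id GREATER 0)
            list(APPEND POSSIBLE_CAPABILITIES ${capability_id})
        endif()
    endforeach()
    message(STATUS "POSSIBLE_CAPABILITIES: ${POSSIBLE_CAPABILITIES}")
    # Prints: -- POSSIBLE_CAPABILITIES: 80;86;89;90;100;120
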
From 51885c9cff2b18617edc7c5181425ab970d06754 Mon Sep 17 00:00:00 2001
From: johnnynunez
Date: Sun, 24 Aug 2025 01:07:48 +0200
Subject: [PATCH 12/18] Replace cub::Max() with cuda::maximum<> in kernel reductions

---
 csrc/kernels.cu | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index 97b80f050..fc4734e54 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -416,7 +417,7 @@ __global__ void kQuantizeBlockwise(
     for (int j = 0; j < NUM_PER_TH; j++)
         local_abs_max = fmaxf(local_abs_max, fabsf((float)vals[j]));
 
-    local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, cub::Max(), valid_items);
+    local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, cuda::maximum<>{}, valid_items);
 
     if (threadIdx.x == 0) {
         smem_absmax_value[0] = 1.0f / local_abs_max;
@@ -1002,9 +1003,9 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
     }
 
     __syncthreads();
-    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items);
+    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cuda::maximum<>{}, valid_items);
     __syncthreads();
-    local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, cub::Max(), valid_items);
+    local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, cuda::maximum<>{}, valid_items);
     if (unorm != NULL) {
         __syncthreads();
         local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items);
@@ -1213,7 +1214,7 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
     }
 
     __syncthreads();
-    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items);
+    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cuda::maximum<>{}, valid_items);
 
     if (threadIdx.x == 0) {
         atomicMax(&new_max1[0], local_max_s1);
@@ -1843,7 +1844,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
     }
 
     // Reduce thread-local absmax across the block.
-    const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
+    const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cuda::maximum<>{}, cols);
     if (threadIdx.x == 0) {
         // Save our block's absmax to shared memory for the quantization step.
         rowStats[row_id] = smem_row_absmax = row_absmax;
@@ -1898,7 +1899,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
 
     // Reduce thread-local absmax across the block.
     // TODO: Consider algorithm BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
-    const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
+    const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cuda::maximum<>{}, cols);
     if (threadIdx.x == 0) {
         // Save our block's absmax to shared memory for the quantization step.
         rowStats[row_id] = row_absmax;
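
A note on the functor swap in PATCH 12 and PATCH 13: recent CUB/CCCL releases (the version bundled with CUDA 13 in particular) no longer ship the old cub::Max() and cub::Sum() function objects; the libcu++ replacements are cuda::maximum<> (from <cuda/functional>) and cuda::std::plus<> (from <cuda/std/functional>), while cub::BlockReduce itself is unchanged. A self-contained kernel using the same pattern; the block size and data are arbitrary, and the file name is made up:

    // block_absmax.cu (illustrative only); compiles with a CUDA 12.8+/13 toolkit.
    #include <cub/block/block_reduce.cuh>
    #include <cuda/functional>
    #include <cstdio>

    __global__ void block_absmax(const float* in, float* out, int n) {
        using BlockReduce = cub::BlockReduce<float, 256>;
        __shared__ typename BlockReduce::TempStorage temp_storage;

        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        float v = (idx < n) ? fabsf(in[idx]) : 0.0f;

        // cuda::maximum<>{} plays the role the removed cub::Max() used to.
        float block_max = BlockReduce(temp_storage).Reduce(v, cuda::maximum<>{});
        if (threadIdx.x == 0)
            out[blockIdx.x] = block_max;
    }

    int main() {
        const int n = 1024, threads = 256, blocks = n / threads;
        float *d_in, *d_out;
        cudaMalloc(&d_in, n * sizeof(float));
        cudaMalloc(&d_out, blocks * sizeof(float));
        cudaMemset(d_in, 0, n * sizeof(float));
        block_absmax<<<blocks, threads>>>(d_in, d_out, n);
        cudaDeviceSynchronize();
        printf("launched %d blocks\n", blocks);
        cudaFree(d_in);
        cudaFree(d_out);
        return 0;
    }
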
From 6dc9b511a956b2b231c56f4033f62d6e7012e521 Mon Sep 17 00:00:00 2001
From: johnnynunez
Date: Sun, 24 Aug 2025 01:10:05 +0200
Subject: [PATCH 13/18] Replace cub::Max() with cuda::maximum<> in kernel reductions

---
 csrc/kernels.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index fc4734e54..c988e7c6a 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -1008,7 +1008,7 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
     local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, cuda::maximum<>{}, valid_items);
     if (unorm != NULL) {
         __syncthreads();
-        local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items);
+        local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cuda::std::plus<>{}, valid_items);
     }
 
     if (threadIdx.x == 0) {
@@ -1220,7 +1220,7 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
     }
     if (unorm != NULL) {
         __syncthreads();
-        local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items);
+        local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cuda::std::plus<>{}, valid_items);
         if (threadIdx.x == 0) {
             atomicAdd(&unorm[0], local_unorm);
         }
@@ -1525,11 +1525,11 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit2StateBlockwise(
     }
 
     // reduce: 2.51/1.60 -> 2.67/1.69
-    new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max());
-    new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, cub::Max());
+    new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cuda::maximum<>{});
+    new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, cuda::maximum<>{});
 
     if (OPTIMIZER == ADEMAMIX) {
-        new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, cub::Max());
+        new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, cuda::maximum<>{});
     }
 
     if (threadIdx.x == 0) {
@@ -1738,7 +1738,7 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit1StateBlockwise(
     }
 
     // reduce: 2.51/1.60 -> 2.67/1.69
-    new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max());
+    new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cuda::maximum<>{});
 
     if (threadIdx.x == 0)
         smem_exchange1[0] = new_local_abs_max1;

From 1c7f0e85d6004641f38d08f9f734da02f82eac0f Mon Sep 17 00:00:00 2001
From: johnnynunez
Date: Sun, 24 Aug 2025 01:11:05 +0200
Subject: [PATCH 14/18] Replace cub::Max() with cuda::maximum<> in kernel reductions

---
 csrc/pythonInterface.cpp | 40 ++++++++--------------------------------
 1 file changed, 8 insertions(+), 32 deletions(-)

diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
index 5e4dcb6a8..9c4cab9cc 100644
--- a/csrc/pythonInterface.cpp
+++ b/csrc/pythonInterface.cpp
@@ -614,40 +614,16 @@ void* cget_managed_ptr(size_t bytes) {
     return ptr;
 }
 
-#include
-#ifndef CUDART_VERSION
-#define CUDART_VERSION 0
-#endif
-
-// Unified helper: CUDA13+ uses cudaMemLocation; older CUDA/HIP keeps int device
-static inline cudaError_t bnb_prefetch_to(void* ptr, size_t bytes, int device, cudaStream_t stream) {
-#if defined(BUILD_CUDA) && !defined(BUILD_HIP) && (CUDART_VERSION >= 13000)
-    cudaMemLocation loc{};
-    if (device == cudaCpuDeviceId) {
-        loc.type = cudaMemLocationTypeHost;
-        loc.id = 0;
-    } else {
-        loc.type = cudaMemLocationTypeDevice;
-        loc.id = device;
-    }
-    return cudaMemPrefetchAsync(ptr, bytes, loc, stream);
-#else
-    // Older CUDA or HIP path (your BUILD_HIP macro maps cudaMemPrefetchAsync -> hipMemPrefetchAsync)
-    return cudaMemPrefetchAsync(ptr, bytes, device, stream);
-#endif
-}
-
 void cprefetch(void* ptr, size_t bytes, int device) {
-    // Only check the device attribute when prefetching to a device
-    if (device != cudaCpuDeviceId) {
-        int hasPrefetch = 0;
-        CUDA_CHECK_RETURN(cudaDeviceGetAttribute(
-            &hasPrefetch, cudaDevAttrConcurrentManagedAccess, device)); // ~40ns
-        if (hasPrefetch == 0)
-            return;
-    }
 
-    CUDA_CHECK_RETURN(bnb_prefetch_to(ptr, bytes, device, /*stream=*/0));
+    int hasPrefetch = 0;
+    CUDA_CHECK_RETURN(
+        cudaDeviceGetAttribute(&hasPrefetch, cudaDevAttrConcurrentManagedAccess, device)
+    ); // 40ns overhead
+    if (hasPrefetch == 0)
+        return;
+
+    CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
     CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }

From 528299db237d7302fc1e6a64b8fd9149f7bf6ec0 Mon Sep 17 00:00:00 2001
From: johnnynunez
Date: Sun, 24 Aug 2025 01:13:12 +0200
Subject: [PATCH 15/18] Replace cub::Max() with cuda::maximum<> in kernel reductions

---
 csrc/pythonInterface.cpp | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
index 9c4cab9cc..be2c6c5dc 100644
--- a/csrc/pythonInterface.cpp
+++ b/csrc/pythonInterface.cpp
@@ -5,6 +5,27 @@
 
 #if BUILD_CUDA
 #include
+#include
+
+#if CUDART_VERSION >= 13000
+static inline cudaError_t bnb_cudaMemPrefetchAsync(const void* ptr,
+                                                   size_t bytes,
+                                                   int device,
+                                                   cudaStream_t stream) {
+    cudaMemLocation loc{};
+    loc.type = cudaMemLocationTypeDevice;
+    loc.id = device;
+    // flags = 0
+    return cudaMemPrefetchAsync(ptr, bytes, loc, 0u, stream);
+}
+#else
+static inline cudaError_t bnb_cudaMemPrefetchAsync(const void* ptr,
+                                                   size_t bytes,
+                                                   int device,
+                                                   cudaStream_t stream) {
+    return cudaMemPrefetchAsync(ptr, bytes, device, stream);
+}
+#endif
 #endif
 #if BUILD_HIP
 #include
@@ -623,7 +644,7 @@ void cprefetch(void* ptr, size_t bytes, int device) {
     if (hasPrefetch == 0)
         return;
 
-    CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
+    CUDA_CHECK_RETURN(bnb_cudaMemPrefetchAsync(ptr, bytes, device, 0));
     CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }

From 555c6df0c389b03478223b28d980795058b4da8e Mon Sep 17 00:00:00 2001
From: johnnynunez
Date: Sun, 24 Aug 2025 01:19:56 +0200
Subject: [PATCH 16/18] Replace cub::Max() with cuda::maximum<> in kernel reductions

---
 CMakeLists.txt | 74 +++++++++++++++++++------------------------------
 1 file changed, 28 insertions(+), 46 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d10d40eb3..c10bb6e62 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,53 +106,35 @@ if(BUILD_CUDA)
 
     if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.4")
         message(FATAL_ERROR "CUDA Version < 11.4 is not supported")
+    elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
+        message(FATAL_ERROR "CUDA Version > 12 is not supported")
     endif()
 
-    # For CMake < 3.23.0: manually determine supported CUDA architectures
-    if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.23.0")
-        message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
-
-        # Base (CUDA 11.4+)
-        set(CMAKE_CUDA_ARCHITECTURES_ALL 80 86 87)
-        set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 80)
-
-        # CUDA 11.8 adds Ada/Hopper
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
-        endif()
-
-        # CUDA 12.8 adds Blackwell
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 120)
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
-        endif()
-
-        # CUDA 13: drop all architectures <= 80-series
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
-            # Add new SMs introduced in CUDA 13
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 103 110 121)
-
-            # Remove all SMs below 80
-            list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES_ALL
-                 50 52 53 60 61 62 70 72 75)
-
-            # Remove majors below 90
-            list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70)
-
-            # Optionally, keep majors as round numbers only in CUDA 13+
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90 100 120)
-        endif()
-
-        # Cleanup: remove duplicates and sort naturally
-        list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL)
-        list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR)
-        list(SORT CMAKE_CUDA_ARCHITECTURES_ALL COMPARE NATURAL)
-        list(SORT CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR COMPARE NATURAL)
-
-        # (Optional) Use these SMs directly for compilation
-        # set(CMAKE_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_ALL}" CACHE STRING "" FORCE)
-    endif()
+    # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL.
+    if(CMAKE_VERSION VERSION_LESS "3.23.0")
+        message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
+
+        # 11.4+ supports these at a minimum.
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "13.0")
+            set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
+            set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES_ALL 75 80 86 87)
+            set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 70 80)
+        endif()
+
+        # CUDA 11.8 adds support for Ada and Hopper.
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
+        endif()
+
+        # CUDA 12.8 adds support for Blackwell.
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
+        endif()
+    endif()
 
     string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math")
 
@@ -161,7 +143,7 @@ if(BUILD_CUDA)
         string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v")
     endif()
 
-    foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL})
+    foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL})
         # Most of the items here are like: `xx-real`, so we just extract the `xx` portion
         string(REGEX MATCH "[0-9]+" capability_id "${capability}")
         if(capability_id GREATER 0)

From 82f3a30fd283daa9859990a67414a9de6c7b1a11 Mon Sep 17 00:00:00 2001
From: Johnny
Date: Wed, 17 Sep 2025 17:54:23 +0200
Subject: [PATCH 17/18] Update CMakeLists.txt

---
 CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3aeccb05..261b561a1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -140,8 +140,12 @@ if(BUILD_CUDA)
 
         # CUDA 12.8 adds support for Blackwell.
         if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 120)
             list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
+        endif()
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 110)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 110)
         endif()
     endif()

From 46f6e5ae7bbe0b24a808f1805cba5e9dc442c0ca Mon Sep 17 00:00:00 2001
From: Johnny
Date: Wed, 17 Sep 2025 17:56:12 +0200
Subject: [PATCH 18/18] Update CMakeLists.txt

---
 CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 261b561a1..118e6312b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,8 +115,6 @@ if(BUILD_CUDA)
 
     if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.4")
         message(FATAL_ERROR "CUDA Version < 11.4 is not supported")
-    elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
-        message(FATAL_ERROR "CUDA Version > 12 is not supported")
     endif()
 
     # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL.
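
With the CUDA 13 version guard removed in PATCH 18 and sm_110 added in PATCH 17, the series can be smoke-tested by configuring against a CUDA 13 toolkit. One possible invocation; the COMPUTE_BACKEND option follows the existing bitsandbytes CMake setup, and the compiler path and the semicolon-separated COMPUTE_CAPABILITY list are illustrative, not prescribed by the series:

    cmake -B build -DCOMPUTE_BACKEND=cuda \
          -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.0/bin/nvcc \
          -DCOMPUTE_CAPABILITY="90;100;110;120" .
    cmake --build build -j
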