From 331c1f13be92eab0c42d19d13d56fbfc680377b2 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 14:03:24 +0100 Subject: [PATCH 01/12] Use the new cuda-python modules. --- kernel_tuner/backends/nvcuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 15259cb23..7adad2786 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -10,7 +10,7 @@ # embedded in try block to be able to generate documentation # and run tests without cuda-python installed try: - from cuda import cuda, cudart, nvrtc + from cuda.bindings import driver, runtime, nvrtc except ImportError: cuda = None From b5d92d2e25d6b538d4a0e80ffe440e7820618944 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 14:14:52 +0100 Subject: [PATCH 02/12] Importing the correct modules, and using them. --- kernel_tuner/backends/nvcuda.py | 68 ++++++++++++++++----------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 7adad2786..dfd2078a4 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -12,7 +12,7 @@ try: from cuda.bindings import driver, runtime, nvrtc except ImportError: - cuda = None + driver = None class CudaFunctions(GPUBackend): @@ -38,34 +38,34 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None """ self.allocations = [] self.texrefs = [] - if not cuda: + if not driver: raise ImportError( "cuda-python not installed, install using 'pip install cuda-python', or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#cuda-and-pycuda." ) # initialize and select device - err = cuda.cuInit(0) + err = driver.cuInit(0) cuda_error_check(err) err, self.device = cuda.cuDeviceGet(device) cuda_error_check(err) - err, self.context = cuda.cuDevicePrimaryCtxRetain(device) + err, self.context = driver.cuDevicePrimaryCtxRetain(device) cuda_error_check(err) if CudaFunctions.last_selected_device != device: - err = cuda.cuCtxSetCurrent(self.context) + err = driver.cuCtxSetCurrent(self.context) cuda_error_check(err) CudaFunctions.last_selected_device = device # compute capabilities and device properties - err, major = cudart.cudaDeviceGetAttribute( - cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device + err, major = runtime.cudaDeviceGetAttribute( + runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device ) cuda_error_check(err) - err, minor = cudart.cudaDeviceGetAttribute( - cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device + err, minor = runtime.cudaDeviceGetAttribute( + runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device ) cuda_error_check(err) - err, self.max_threads = cudart.cudaDeviceGetAttribute( - cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device + err, self.max_threads = runtime.cudaDeviceGetAttribute( + runtime.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device ) cuda_error_check(err) self.cc = f"{major}{minor}" @@ -78,11 +78,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.compiler_options_bytes.append(str(option).encode("UTF-8")) # create a stream and events - err, self.stream = cuda.cuStreamCreate(0) + err, self.stream = driver.cuStreamCreate(0) cuda_error_check(err) - err, self.start = cuda.cuEventCreate(0) + err, self.start = driver.cuEventCreate(0) cuda_error_check(err) - err, self.end = cuda.cuEventCreate(0) + err, self.end = driver.cuEventCreate(0) cuda_error_check(err) # default dynamically allocated shared memory size, can be overwritten using smem_args @@ -95,7 +95,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None observer.register_device(self) # collect environment information - err, device_properties = cudart.cudaGetDeviceProperties(device) + err, device_properties = runtime.cudaGetDeviceProperties(device) cuda_error_check(err) env = dict() env["device_name"] = device_properties.name.decode() @@ -109,8 +109,8 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None def __del__(self): for device_memory in self.allocations: - if isinstance(device_memory, cuda.CUdeviceptr): - err = cuda.cuMemFree(device_memory) + if isinstance(device_memory, driver.CUdeviceptr): + err = driver.cuMemFree(device_memory) cuda_error_check(err) def ready_argument_list(self, arguments): @@ -128,7 +128,7 @@ def ready_argument_list(self, arguments): for arg in arguments: # if arg is a numpy array copy it to device if isinstance(arg, np.ndarray): - err, device_memory = cuda.cuMemAlloc(arg.nbytes) + err, device_memory = driver.cuMemAlloc(arg.nbytes) cuda_error_check(err) self.allocations.append(device_memory) gpu_args.append(device_memory) @@ -184,18 +184,18 @@ def compile(self, kernel_instance): buff = b" " * size err = nvrtc.nvrtcGetPTX(program, buff) cuda_error_check(err) - err, self.current_module = cuda.cuModuleLoadData(np.char.array(buff)) - if err == cuda.CUresult.CUDA_ERROR_INVALID_PTX: + err, self.current_module = driver.cuModuleLoadData(np.char.array(buff)) + if err == driver.CUresult.CUDA_ERROR_INVALID_PTX: raise SkippableFailure("uses too much shared data") else: cuda_error_check(err) - err, self.func = cuda.cuModuleGetFunction( + err, self.func = driver.cuModuleGetFunction( self.current_module, str.encode(kernel_name) ) cuda_error_check(err) # get the number of registers per thread used in this kernel - num_regs = cuda.cuFuncGetAttribute(cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func) + num_regs = driver.cuFuncGetAttribute(driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func) assert num_regs[0] == 0, f"Retrieving number of registers per thread unsuccesful: code {num_regs[0]}" self.num_regs = num_regs[1] @@ -210,18 +210,18 @@ def compile(self, kernel_instance): def start_event(self): """Records the event that marks the start of a measurement.""" - err = cudart.cudaEventRecord(self.start, self.stream) + err = runtime.cudaEventRecord(self.start, self.stream) cuda_error_check(err) def stop_event(self): """Records the event that marks the end of a measurement.""" - err = cudart.cudaEventRecord(self.end, self.stream) + err = runtime.cudaEventRecord(self.end, self.stream) cuda_error_check(err) def kernel_finished(self): """Returns True if the kernel has finished, False otherwise.""" - err = cudart.cudaEventQuery(self.end) - if err[0] == cudart.cudaError_t.cudaSuccess: + err = runtime.cudaEventQuery(self.end) + if err[0] == runtime.cudaError_t.cudaSuccess: return True else: return False @@ -229,7 +229,7 @@ def kernel_finished(self): @staticmethod def synchronize(): """Halts execution until device has finished its tasks.""" - err = cudart.cudaDeviceSynchronize() + err = runtime.cudaDeviceSynchronize() cuda_error_check(err) def copy_constant_memory_args(self, cmem_args): @@ -243,9 +243,9 @@ def copy_constant_memory_args(self, cmem_args): :type cmem_args: dict( string: numpy.ndarray, ... ) """ for k, v in cmem_args.items(): - err, symbol, _ = cuda.cuModuleGetGlobal(self.current_module, str.encode(k)) + err, symbol, _ = driver.cuModuleGetGlobal(self.current_module, str.encode(k)) cuda_error_check(err) - err = cuda.cuMemcpyHtoD(symbol, v, v.nbytes) + err = driver.cuMemcpyHtoD(symbol, v, v.nbytes) cuda_error_check(err) def copy_shared_memory_args(self, smem_args): @@ -284,12 +284,12 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): stream = self.stream arg_types = list() for arg in gpu_args: - if isinstance(arg, cuda.CUdeviceptr): + if isinstance(arg, driver.CUdeviceptr): arg_types.append(None) else: arg_types.append(np.ctypeslib.as_ctypes_type(arg.dtype)) kernel_args = (tuple(gpu_args), tuple(arg_types)) - err = cuda.cuLaunchKernel( + err = driver.cuLaunchKernel( func, grid[0], grid[1], @@ -318,7 +318,7 @@ def memset(allocation, value, size): :type size: int """ - err = cudart.cudaMemset(allocation, value, size) + err = runtime.cudaMemset(allocation, value, size) cuda_error_check(err) @staticmethod @@ -331,7 +331,7 @@ def memcpy_dtoh(dest, src): :param src: A GPU memory allocation unit :type src: cuda.CUdeviceptr """ - err = cuda.cuMemcpyDtoH(dest, src, dest.nbytes) + err = driver.cuMemcpyDtoH(dest, src, dest.nbytes) cuda_error_check(err) @staticmethod @@ -344,7 +344,7 @@ def memcpy_htod(dest, src): :param src: A numpy array in host memory to store the data :type src: numpy.ndarray """ - err = cuda.cuMemcpyHtoD(dest, src, src.nbytes) + err = driver.cuMemcpyHtoD(dest, src, src.nbytes) cuda_error_check(err) units = {"time": "ms"} From 881db4dbfedd86c049627caedf05f474769213b8 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 14:26:09 +0100 Subject: [PATCH 03/12] Split the cuda-python utils from the main file. --- kernel_tuner/backends/nvcuda.py | 27 ++++++++------------------- kernel_tuner/util.py | 28 ---------------------------- kernel_tuner/utils/nvcuda.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 47 deletions(-) create mode 100644 kernel_tuner/utils/nvcuda.py diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index dfd2078a4..7c43528a5 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -5,7 +5,8 @@ from kernel_tuner.backends.backend import GPUBackend from kernel_tuner.observers.nvcuda import CudaRuntimeObserver -from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc +from kernel_tuner.util import SkippableFailure +from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc # embedded in try block to be able to generate documentation # and run tests without cuda-python installed @@ -56,13 +57,9 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None CudaFunctions.last_selected_device = device # compute capabilities and device properties - err, major = runtime.cudaDeviceGetAttribute( - runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device - ) + err, major = runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device) cuda_error_check(err) - err, minor = runtime.cudaDeviceGetAttribute( - runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device - ) + err, minor = runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device) cuda_error_check(err) err, self.max_threads = runtime.cudaDeviceGetAttribute( runtime.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device @@ -164,20 +161,14 @@ def compile(self, kernel_instance): if not any(["--std=" in opt for opt in self.compiler_options]): self.compiler_options.append("--std=c++11") if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]): - compiler_options.append( - f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8") - ) + compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")) if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]): self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}") - err, program = nvrtc.nvrtcCreateProgram( - str.encode(kernel_string), b"CUDAProgram", 0, [], [] - ) + err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], []) try: cuda_error_check(err) - err = nvrtc.nvrtcCompileProgram( - program, len(compiler_options), compiler_options - ) + err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options) cuda_error_check(err) err, size = nvrtc.nvrtcGetPTXSize(program) cuda_error_check(err) @@ -189,9 +180,7 @@ def compile(self, kernel_instance): raise SkippableFailure("uses too much shared data") else: cuda_error_check(err) - err, self.func = driver.cuModuleGetFunction( - self.current_module, str.encode(kernel_name) - ) + err, self.func = driver.cuModuleGetFunction(self.current_module, str.encode(kernel_name)) cuda_error_check(err) # get the number of registers per thread used in this kernel diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 2d9e3f1b3..2c50bd6cc 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -38,10 +38,6 @@ import cupy as cp except ImportError: cp = np -try: - from cuda import cuda, cudart, nvrtc -except ImportError: - cuda = None from kernel_tuner.observers.nvml import NVMLObserver @@ -642,14 +638,6 @@ def get_total_timings(results, env, overhead_time): return env -NVRTC_VALID_CC = np.array(["50", "52", "53", "60", "61", "62", "70", "72", "75", "80", "87", "89", "90", "90a"]) - - -def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str: - """Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.""" - return max(NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability], default="52") - - def print_config(config, tuning_options, runner): """Print the configuration string with tunable parameters and benchmark results.""" print_config_output(tuning_options.tune_params, config, runner.quiet, tuning_options.metrics, runner.units) @@ -1315,19 +1303,3 @@ def dump_cache(obj: str, tuning_options): if isinstance(tuning_options.cache, dict) and tuning_options.cachefile: with open(tuning_options.cachefile, "a") as cachefile: cachefile.write(obj) - - -def cuda_error_check(error): - """Checking the status of CUDA calls using the NVIDIA cuda-python backend.""" - if isinstance(error, cuda.CUresult): - if error != cuda.CUresult.CUDA_SUCCESS: - _, name = cuda.cuGetErrorName(error) - raise RuntimeError(f"CUDA error: {name.decode()}") - elif isinstance(error, cudart.cudaError_t): - if error != cudart.cudaError_t.cudaSuccess: - _, name = cudart.getErrorName(error) - raise RuntimeError(f"CUDART error: {name.decode()}") - elif isinstance(error, nvrtc.nvrtcResult): - if error != nvrtc.nvrtcResult.NVRTC_SUCCESS: - _, desc = nvrtc.nvrtcGetErrorString(error) - raise RuntimeError(f"NVRTC error: {desc.decode()}") diff --git a/kernel_tuner/utils/nvcuda.py b/kernel_tuner/utils/nvcuda.py new file mode 100644 index 000000000..72a0838ec --- /dev/null +++ b/kernel_tuner/utils/nvcuda.py @@ -0,0 +1,31 @@ +"""Module for kernel tuner cuda-python utility functions.""" + +import numpy as np + +try: + from cuda.bindings import driver, runtime, nvrtc +except ImportError: + cuda = None + +NVRTC_VALID_CC = np.array(["50", "52", "53", "60", "61", "62", "70", "72", "75", "80", "87", "89", "90", "90a"]) + + +def cuda_error_check(error): + """Checking the status of CUDA calls using the NVIDIA cuda-python backend.""" + if isinstance(error, driver.CUresult): + if error != driver.CUresult.CUDA_SUCCESS: + _, name = driver.cuGetErrorName(error) + raise RuntimeError(f"CUDA error: {name.decode()}") + elif isinstance(error, runtime.cudaError_t): + if error != runtime.cudaError_t.cudaSuccess: + _, name = runtime.getErrorName(error) + raise RuntimeError(f"CUDART error: {name.decode()}") + elif isinstance(error, nvrtc.nvrtcResult): + if error != nvrtc.nvrtcResult.NVRTC_SUCCESS: + _, desc = nvrtc.nvrtcGetErrorString(error) + raise RuntimeError(f"NVRTC error: {desc.decode()}") + + +def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str: + """Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.""" + return max(NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability], default="52") From e9e87d3e3265b35270fb6ec9ee49097e913e010b Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 14:29:27 +0100 Subject: [PATCH 04/12] Modify the import on the observer. --- kernel_tuner/backends/nvcuda.py | 4 ++-- kernel_tuner/observers/nvcuda.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 7c43528a5..e84ce6765 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -13,7 +13,7 @@ try: from cuda.bindings import driver, runtime, nvrtc except ImportError: - driver = None + cuda = None class CudaFunctions(GPUBackend): @@ -39,7 +39,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None """ self.allocations = [] self.texrefs = [] - if not driver: + if not cuda: raise ImportError( "cuda-python not installed, install using 'pip install cuda-python', or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#cuda-and-pycuda." ) diff --git a/kernel_tuner/observers/nvcuda.py b/kernel_tuner/observers/nvcuda.py index c0a33ad5c..4e5caf72a 100644 --- a/kernel_tuner/observers/nvcuda.py +++ b/kernel_tuner/observers/nvcuda.py @@ -1,12 +1,12 @@ import numpy as np try: - from cuda import cudart + from cuda.bindings import cudart except ImportError: cuda = None from kernel_tuner.observers.observer import BenchmarkObserver -from kernel_tuner.util import cuda_error_check +from kernel_tuner.utils.nvcuda import cuda_error_check class CudaRuntimeObserver(BenchmarkObserver): From 31f24659371cb6d6d5dfa29a059524f08cfed8fa Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 14:39:00 +0100 Subject: [PATCH 05/12] Fixed the tests. --- kernel_tuner/backends/nvcuda.py | 4 ++-- kernel_tuner/observers/nvcuda.py | 4 ++-- test/test_cuda_functions.py | 6 +++--- test/test_util_functions.py | 10 ---------- test/utils/nvcuda.py | 11 +++++++++++ 5 files changed, 18 insertions(+), 17 deletions(-) create mode 100644 test/utils/nvcuda.py diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index e84ce6765..7c43528a5 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -13,7 +13,7 @@ try: from cuda.bindings import driver, runtime, nvrtc except ImportError: - cuda = None + driver = None class CudaFunctions(GPUBackend): @@ -39,7 +39,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None """ self.allocations = [] self.texrefs = [] - if not cuda: + if not driver: raise ImportError( "cuda-python not installed, install using 'pip install cuda-python', or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#cuda-and-pycuda." ) diff --git a/kernel_tuner/observers/nvcuda.py b/kernel_tuner/observers/nvcuda.py index 4e5caf72a..13fd4587b 100644 --- a/kernel_tuner/observers/nvcuda.py +++ b/kernel_tuner/observers/nvcuda.py @@ -1,7 +1,7 @@ import numpy as np try: - from cuda.bindings import cudart + from cuda.bindings import runtime except ImportError: cuda = None @@ -21,7 +21,7 @@ def __init__(self, dev): def after_finish(self): # Time is measured in milliseconds - err, time = cudart.cudaEventElapsedTime(self.start, self.end) + err, time = runtime.cudaEventElapsedTime(self.start, self.end) cuda_error_check(err) self.times.append(time) diff --git a/test/test_cuda_functions.py b/test/test_cuda_functions.py index 1dc68652d..3b1da7c5c 100644 --- a/test/test_cuda_functions.py +++ b/test/test_cuda_functions.py @@ -9,7 +9,7 @@ from .test_runners import env # noqa: F401 try: - from cuda import cuda + from cuda.bindings import driver except Exception: pass @@ -27,9 +27,9 @@ def test_ready_argument_list(): dev = nvcuda.CudaFunctions(0) gpu_args = dev.ready_argument_list(arguments) - assert isinstance(gpu_args[0], cuda.CUdeviceptr) + assert isinstance(gpu_args[0], driver.CUdeviceptr) assert isinstance(gpu_args[1], np.int32) - assert isinstance(gpu_args[2], cuda.CUdeviceptr) + assert isinstance(gpu_args[2], driver.CUdeviceptr) @skip_if_no_cuda diff --git a/test/test_util_functions.py b/test/test_util_functions.py index 4a1858f37..e785f415d 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -162,16 +162,6 @@ def test_get_thread_block_dimensions(): assert threads[2] == 1 -def test_to_valid_nvrtc_gpu_arch_cc(): - assert to_valid_nvrtc_gpu_arch_cc("89") == "89" - assert to_valid_nvrtc_gpu_arch_cc("88") == "87" - assert to_valid_nvrtc_gpu_arch_cc("86") == "80" - assert to_valid_nvrtc_gpu_arch_cc("40") == "52" - assert to_valid_nvrtc_gpu_arch_cc("90b") == "90a" - assert to_valid_nvrtc_gpu_arch_cc("91c") == "90a" - assert to_valid_nvrtc_gpu_arch_cc("1234") == "52" - - def test_prepare_kernel_string(): kernel = "this is a weird kernel" grid = (3, 7) diff --git a/test/utils/nvcuda.py b/test/utils/nvcuda.py new file mode 100644 index 000000000..4d05440cf --- /dev/null +++ b/test/utils/nvcuda.py @@ -0,0 +1,11 @@ +from kernel_tuner.utils.nvcuda import to_valid_nvrtc_gpu_arch_cc + + +def test_to_valid_nvrtc_gpu_arch_cc(): + assert to_valid_nvrtc_gpu_arch_cc("89") == "89" + assert to_valid_nvrtc_gpu_arch_cc("88") == "87" + assert to_valid_nvrtc_gpu_arch_cc("86") == "80" + assert to_valid_nvrtc_gpu_arch_cc("40") == "52" + assert to_valid_nvrtc_gpu_arch_cc("90b") == "90a" + assert to_valid_nvrtc_gpu_arch_cc("91c") == "90a" + assert to_valid_nvrtc_gpu_arch_cc("1234") == "52" From cb1a1b21532d9ffefd8d2b1aee50a69b8f1bd599 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 14:42:30 +0100 Subject: [PATCH 06/12] Found a missed instance of cuda. --- kernel_tuner/backends/nvcuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 7c43528a5..000c7876b 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -47,7 +47,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None # initialize and select device err = driver.cuInit(0) cuda_error_check(err) - err, self.device = cuda.cuDeviceGet(device) + err, self.device = driver.cuDeviceGet(device) cuda_error_check(err) err, self.context = driver.cuDevicePrimaryCtxRetain(device) cuda_error_check(err) From 476ff94edf945758e76c7cb4a53cefd22255f4d9 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 14:43:22 +0100 Subject: [PATCH 07/12] Typo in comment. --- kernel_tuner/backends/nvcuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 000c7876b..5f49c8d8f 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -17,7 +17,7 @@ class CudaFunctions(GPUBackend): - """Class that groups the Cuda functions on maintains state about the device.""" + """Class that groups the Cuda functions and it maintains state about the device.""" def __init__(self, device=0, iterations=7, compiler_options=None, observers=None): """Instantiate CudaFunctions object used for interacting with the CUDA device. From 737c1f822113633cbc95fe586ef0bc18ab686da1 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 14:48:32 +0100 Subject: [PATCH 08/12] Correct API. --- kernel_tuner/utils/nvcuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/nvcuda.py b/kernel_tuner/utils/nvcuda.py index 72a0838ec..5fc38b1e1 100644 --- a/kernel_tuner/utils/nvcuda.py +++ b/kernel_tuner/utils/nvcuda.py @@ -18,7 +18,7 @@ def cuda_error_check(error): raise RuntimeError(f"CUDA error: {name.decode()}") elif isinstance(error, runtime.cudaError_t): if error != runtime.cudaError_t.cudaSuccess: - _, name = runtime.getErrorName(error) + _, name = runtime.cudaGetErrorName(error) raise RuntimeError(f"CUDART error: {name.decode()}") elif isinstance(error, nvrtc.nvrtcResult): if error != nvrtc.nvrtcResult.NVRTC_SUCCESS: From 7ca71ba9b97b4cd25578ef3847189b87729b0459 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 14:53:50 +0100 Subject: [PATCH 09/12] Fixed the CUDA_VERSION variable. --- kernel_tuner/backends/nvcuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 5f49c8d8f..661aa466c 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -96,7 +96,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None cuda_error_check(err) env = dict() env["device_name"] = device_properties.name.decode() - env["cuda_version"] = cuda.CUDA_VERSION + env["cuda_version"] = driver.CUDA_VERSION env["compute_capability"] = self.cc env["iterations"] = self.iterations env["compiler_options"] = self.compiler_options From 6ed05e65c597ec1418ac95d0aabeae5d0baaab2b Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 15:00:33 +0100 Subject: [PATCH 10/12] Change the error message. --- kernel_tuner/utils/nvcuda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/utils/nvcuda.py b/kernel_tuner/utils/nvcuda.py index 5fc38b1e1..f86f5dd58 100644 --- a/kernel_tuner/utils/nvcuda.py +++ b/kernel_tuner/utils/nvcuda.py @@ -15,11 +15,11 @@ def cuda_error_check(error): if isinstance(error, driver.CUresult): if error != driver.CUresult.CUDA_SUCCESS: _, name = driver.cuGetErrorName(error) - raise RuntimeError(f"CUDA error: {name.decode()}") + raise RuntimeError(f"CUDA Driver error: {name.decode()}") elif isinstance(error, runtime.cudaError_t): if error != runtime.cudaError_t.cudaSuccess: _, name = runtime.cudaGetErrorName(error) - raise RuntimeError(f"CUDART error: {name.decode()}") + raise RuntimeError(f"CUDA Runtime error: {name.decode()}") elif isinstance(error, nvrtc.nvrtcResult): if error != nvrtc.nvrtcResult.NVRTC_SUCCESS: _, desc = nvrtc.nvrtcGetErrorString(error) From 91a01c07a6f94715361a258a286bdffceb1449a4 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 15:04:16 +0100 Subject: [PATCH 11/12] Update the valid compute capabilities. --- kernel_tuner/utils/nvcuda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/utils/nvcuda.py b/kernel_tuner/utils/nvcuda.py index f86f5dd58..f8040c4bc 100644 --- a/kernel_tuner/utils/nvcuda.py +++ b/kernel_tuner/utils/nvcuda.py @@ -7,7 +7,7 @@ except ImportError: cuda = None -NVRTC_VALID_CC = np.array(["50", "52", "53", "60", "61", "62", "70", "72", "75", "80", "87", "89", "90", "90a"]) +NVRTC_VALID_CC = np.array(["50", "52", "53", "60", "61", "62", "70", "72", "75", "80", "87", "89", "90", "90a", "100", "100f", "100a", "101", "101f", "101a", "103", "103f", "103a", "120", "120f", "120a", "121", "121f", "121a"]) def cuda_error_check(error): @@ -28,4 +28,4 @@ def cuda_error_check(error): def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str: """Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.""" - return max(NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability], default="52") + return max(NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability], default="75") From 2d05fc545edd6a23cee4ec2545c68c1789c57989 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 Dec 2025 15:04:51 +0100 Subject: [PATCH 12/12] Formatting. --- kernel_tuner/utils/nvcuda.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/utils/nvcuda.py b/kernel_tuner/utils/nvcuda.py index f8040c4bc..5bbd7b5ea 100644 --- a/kernel_tuner/utils/nvcuda.py +++ b/kernel_tuner/utils/nvcuda.py @@ -7,7 +7,39 @@ except ImportError: cuda = None -NVRTC_VALID_CC = np.array(["50", "52", "53", "60", "61", "62", "70", "72", "75", "80", "87", "89", "90", "90a", "100", "100f", "100a", "101", "101f", "101a", "103", "103f", "103a", "120", "120f", "120a", "121", "121f", "121a"]) +NVRTC_VALID_CC = np.array( + [ + "50", + "52", + "53", + "60", + "61", + "62", + "70", + "72", + "75", + "80", + "87", + "89", + "90", + "90a", + "100", + "100f", + "100a", + "101", + "101f", + "101a", + "103", + "103f", + "103a", + "120", + "120f", + "120a", + "121", + "121f", + "121a", + ] +) def cuda_error_check(error):