README.md: 32 additions & 1 deletion

```diff
@@ -11,11 +11,13 @@ This project provides a pre-configured environment for such workflows...:
 3. including [CCCL](https://github.com/NVIDIA/cccl) libraries, like Thrust and CUB, to simplify the code.
 
 As an example, the repository implements, tests, and benchmarks only two operations: array accumulation and matrix multiplication.
-The baseline Python + Numba implementations are placed in `starter_kit_baseline.py`, and the optimized CUDA nd OpenMP implementations are placed in `starter_kit.cu`.
+The baseline Python + Numba implementations are placed in `starter_kit_baseline.py`, and the optimized CUDA and OpenMP implementations are placed in `starter_kit.cu`.
 If no CUDA-capable device is found, the file will be treated as a CPU-only C++ implementation.
 If VSCode is used, the `tasks.json` file is configured with debuggers for both CPU and GPU code, both in Python and C++.
 The `.clang-format` is configured with LLVM base style, adjusted for wider screens, allowing 120 characters per line.
 
+**Multi-GPU Support**: The repository now includes multi-GPU implementations for both reduction and matrix multiplication operations, utilizing CUDA cooperative groups and efficient device partitioning strategies.
+
 ## Installation
 
 I'd recommend forking the repository for your own projects, but you can also clone it directly:
@@ -51,6 +53,35 @@ The project is designed to be as simple as possible, with the following workflow:
 2. Implement your baseline algorithm in `starter_kit_baseline.py`.
 3. Implement your optimized algorithm in `starter_kit.cu`.
 
+## Multi-GPU Features
+
+The starter kit now includes multi-GPU implementations:
+
+- **Multi-GPU Reduction**: Partitions data across available GPUs, performs parallel reductions, and aggregates results
+- **Multi-GPU Matrix Multiplication**: Distributes matrix rows across GPUs using row-wise partitioning with peer-to-peer access when available
+- **Automatic Detection**: Falls back to single-GPU or CPU when multiple GPUs are not available
+- **Cooperative Groups**: Uses CUDA cooperative groups for efficient inter-block synchronization
+- **Tested & Benchmarked**: Comprehensive test suite and performance benchmarks included
+
+Usage:
+```python
+import numpy as np
+from starter_kit import reduce_cuda_multigpu, matmul_cuda_multigpu, get_cuda_device_count
+
+# Check available GPUs
+num_gpus = get_cuda_device_count()
+print(f"Available GPUs: {num_gpus}")
+
+# Multi-GPU reduction
+data = np.random.rand(1_000_000).astype(np.float32)
+result = reduce_cuda_multigpu(data)
+
+# Multi-GPU matrix multiplication
+a = np.random.rand(1024, 1024).astype(np.float32)
+b = np.random.rand(1024, 1024).astype(np.float32)
+c = matmul_cuda_multigpu(a, b, tile_size=16)
+```
+
 ## Reading Materials
 
 Beginner GPGPU:
```
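
To make the new README section concrete, the row-wise split behind `matmul_cuda_multigpu` can be sketched in plain Python. This is an illustration of the partitioning idea only, not code from `starter_kit.cu`; `num_gpus` and `M` are placeholder values:

```python
# Illustrative only: dividing the rows of an M-row matrix across GPUs.
num_gpus, M = 4, 1024                          # hypothetical device count and row count
rows_per_gpu = (M + num_gpus - 1) // num_gpus  # ceil-divide so every row is covered
row_ranges = [
    (g * rows_per_gpu, min((g + 1) * rows_per_gpu, M))  # [start, end) rows for GPU g
    for g in range(num_gpus)
]
# Conceptually, GPU g computes C[start:end, :] = A[start:end, :] @ B,
# and the per-device slices are concatenated into the full product.
```

The usage snippet can also be followed by a quick sanity check against NumPy. The tolerances here are illustrative assumptions, not values from the project's test suite; float32 accumulation order differs across devices, so exact equality is not expected:

```python
# Continues the usage snippet above: `data`, `result`, `a`, `b`, `c` as defined there.
assert np.isclose(result, data.sum(), rtol=1e-3)
assert np.allclose(c, a @ b, rtol=1e-2, atol=1e-4)
```
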
bench.py: 14 additions & 1 deletion

```diff
@@ -15,7 +15,16 @@
 import pytest
 
 from starter_kit_baseline import reduce as reduce_baseline, matmul as matmul_baseline
-from starter_kit import reduce_openmp, reduce_cuda, matmul_openmp, matmul_cuda, supports_cuda
+from starter_kit import (
+    reduce_openmp,
+    reduce_cuda,
+    reduce_cuda_multigpu,
+    matmul_openmp,
+    matmul_cuda,
+    matmul_cuda_multigpu,
+    supports_cuda,
+    get_cuda_device_count,
+)
 
 # Build lists of (name, kernel_function) for reduction and matrix multiplication.
 REDUCTION_KERNELS = [
@@ -24,13 +33,17 @@
 ]
 if supports_cuda():
     REDUCTION_KERNELS.append(("cuda", reduce_cuda))
+if get_cuda_device_count() > 1:
+    REDUCTION_KERNELS.append(("cuda_multigpu", reduce_cuda_multigpu))
 
 MATMUL_KERNELS = [
     ("baseline", matmul_baseline),
     ("openmp", matmul_openmp),
 ]
 if supports_cuda():
     MATMUL_KERNELS.append(("cuda", matmul_cuda))
+if get_cuda_device_count() > 1:
+    MATMUL_KERNELS.append(("cuda_multigpu", matmul_cuda_multigpu))
 
 
 @pytest.mark.parametrize("dtype", [np.float32, np.int32])
```
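
The parametrized test bodies fall outside this diff. A minimal sketch of how a `(name, kernel)` list like `REDUCTION_KERNELS` is typically consumed with `pytest-benchmark` might look as follows; the test name, input size, and tolerance are assumptions, not code from `bench.py`:

```python
import numpy as np
import pytest

# Assumes REDUCTION_KERNELS is defined as in the module above.
@pytest.mark.parametrize("name,kernel", REDUCTION_KERNELS)
def test_reduce(benchmark, name, kernel):
    data = np.random.rand(1_000_000).astype(np.float32)
    result = benchmark(kernel, data)  # the `benchmark` fixture repeats and times the call
    assert np.isclose(result, data.sum(), rtol=1e-3)
```
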
pyproject.toml: 2 additions & 2 deletions

```diff
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=42", "wheel", "pybind11", "numpy"]
+requires = ["setuptools>=42", "wheel", "pybind11>=2.10,<2.13", "numpy"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -13,7 +13,7 @@ authors = [
     { name = "Ash Vardanian", email = "1983160+ashvardanian@users.noreply.github.com" },
 ]
 urls = { Homepage = "https://github.com/ashvardanian/PyBindToGPUs" }
-dependencies = ["pybind11", "numpy", "numba"]
+dependencies = ["pybind11>=2.10,<2.13", "numpy", "numba"]
 
 [project.optional-dependencies]
 cpu = ["pytest", "pytest-repeat", "pytest-benchmark"]
```
setup.py: 60 additions & 18 deletions

```diff
@@ -29,29 +29,33 @@ def build_extensions(self):
         super().build_extension(ext)
 
     def build_cuda_extension(self, ext):
-        # Compile CUDA source files
+        # Compile everything with NVCC (both device and host code)
+        cuda_objects = []
+        other_objects = []
+
         for source in ext.sources:
             if source.endswith(".cu"):
+                # Use NVCC to compile everything (device + host code)
                 self.compile_cuda(source)
-
-        # Compile non-CUDA source files
-        objects = []
-        for source in ext.sources:
-            if not source.endswith(".cu"):
+                cuda_objects.append(os.path.join(self.build_temp, "starter_kit.o"))
+            else:
+                # Compile non-CUDA files with GCC
                 obj = self.compiler.compile(
                     [source],
                     output_dir=self.build_temp,
                     include_dirs=ext.include_dirs,
                     extra_postargs=[
                         "-fPIC",
                         "-std=c++17",
                         "-fdiagnostics-color=always",
                     ],
                 )
-                objects.extend(obj)
+                other_objects.extend(obj)
 
+        # Link all object files
+        all_objects = cuda_objects + other_objects
         self.compiler.link_shared_object(
-            objects + [os.path.join(self.build_temp, "starter_kit.o")],
+            all_objects,
             self.get_ext_fullpath(ext.name),
             libraries=ext.libraries,
             library_dirs=ext.library_dirs,
@@ -103,33 +107,71 @@ def build_gcc_extension(self, ext):
         )
 
     def compile_cuda(self, source):
-        # Compile CUDA source file using NVCC
+        # Compile the CUDA source with NVCC (device and host code)
+        import subprocess
         ext = self.extensions[0]
         output_dir = self.build_temp
         os.makedirs(output_dir, exist_ok=True)
-        include_dirs = self.compiler.include_dirs + ext.include_dirs
-        include_dirs = " ".join(f"-I{dir}" for dir in include_dirs)
+
+        # Include all directories: CUDA headers, PyBind11, NumPy, Python, CCCL
+        # Filter to only existing directories
+        include_dirs = [d for d in ext.include_dirs if os.path.exists(d)]
+        print(f"\n{'='*70}")
+        print(f"Include Directories for NVCC:")
+        for d in include_dirs:
+            print(f" - {d}")
+        if not include_dirs:
+            print(" * WARNING: No include directories found!")
+        print(f"{'='*70}\n")
+
+        cuda_include_dirs_str = " ".join(f"-I{dir}" for dir in include_dirs)
         output_file = os.path.join(output_dir, "starter_kit.o")
 
         # Let's try inferring the compute capability from the GPU
-        arch_code = "90"
+        arch_code = "75"  # Default to Turing (T4 GPU)
         try:
             import pycuda.driver as cuda
             import pycuda.autoinit
 
             device = cuda.Device(0)  # Get the default device
             major, minor = device.compute_capability()
             arch_code = f"{major}{minor}"
-        except ImportError:
-            pass
+            print(f"Detected GPU Compute Capability: {arch_code}")
+        except Exception as e:
+            print(f"Could not detect GPU, using default arch {arch_code}: {e}")
 
+        # Compile both device and host code with nvcc (no -dc flag)
         cmd = (
             f"nvcc -c {source} -o {output_file} -std=c++17 "
             f"-gencode=arch=compute_{arch_code},code=sm_{arch_code} "
-            f"-Xcompiler -fPIC {include_dirs} -O3 -g"
+            f"--expt-relaxed-constexpr --expt-extended-lambda "
+            f"-D__CUDACC_RELAXED_CONSTEXPR__ "
+            f"-Xcompiler -fPIC,-Wno-psabi {cuda_include_dirs_str} -O3 -g"
         )
-        if os.system(cmd) != 0:
-            raise RuntimeError(f"nvcc compilation of {source} failed")
+
+        print(f"\n{'='*70}")
+        print(f"NVCC Command:")
+        print(f"{cmd}")
+        print(f"{'='*70}\n")
+
+        # Use subprocess to capture output
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+
+        if result.returncode != 0:
+            print(f"\n{'='*70}")
+            print(f"NVCC COMPILATION FAILED!")
+            print(f"{'='*70}")
+            print(f"STDOUT:\n{result.stdout}")
+            print(f"{'='*70}")
+            print(f"STDERR:\n{result.stderr}")
+            print(f"{'='*70}\n")
+            raise RuntimeError(f"nvcc compilation of {source} failed with exit code {result.returncode}")
+        else:
+            print(f"- NVCC compilation successful")
+            if result.stdout:
+                print(f"STDOUT: {result.stdout}")
+            if result.stderr:
+                print(f"STDERR: {result.stderr}")
 
 
 __version__ = open("VERSION", "r").read().strip()
@@ -165,7 +207,7 @@ def compile_cuda(self, source):
         ],
         #
         libraries=[python_lib_name.replace(".a", "")]
-        + (["cudart", "cuda", "cublas"] if enable_cuda else [])
+        + (["cudart", "cublas"] if enable_cuda else [])
         + (["gomp"] if enable_openmp else []),
         #
         extra_link_args=[f"-Wl,-rpath,{python_lib_dir}"]
```
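
A note on the compute-capability fallback in `compile_cuda`: when `pycuda` is not installed, the build silently uses the hard-coded `75`. One alternative is to query `nvidia-smi` directly. The sketch below is an option under assumptions, relying on a driver new enough to support the `compute_cap` query field; it is not part of this diff:

```python
import subprocess


def detect_arch_code(default: str = "75") -> str:
    """Best-effort compute-capability lookup, e.g. '75' for a Turing T4."""
    try:
        out = subprocess.run(
            ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"],
            capture_output=True, text=True, check=True,
        )
        first_gpu = out.stdout.strip().splitlines()[0]  # e.g. "7.5"
        return first_gpu.replace(".", "")               # "7.5" -> "75"
    except (FileNotFoundError, subprocess.CalledProcessError, IndexError):
        return default  # fall back to the same Turing default as setup.py
```
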