README.md: 32 additions & 1 deletion

```diff
@@ -11,11 +11,13 @@ This project provides a pre-configured environment for such workflows...:
 3. including [CCCL](https://github.com/NVIDIA/cccl) libraries, like Thrust and CUB, to simplify the code.
 
 As an example, the repository implements, tests, and benchmarks only two operations: array accumulation and matrix multiplication.
-The baseline Python + Numba implementations are placed in `starter_kit_baseline.py`, and the optimized CUDA nd OpenMP implementations are placed in `starter_kit.cu`.
+The baseline Python + Numba implementations are placed in `starter_kit_baseline.py`, and the optimized CUDA and OpenMP implementations are placed in `starter_kit.cu`.
 If no CUDA-capable device is found, the file will be treated as a CPU-only C++ implementation.
 If VSCode is used, the `tasks.json` file is configured with debuggers for both CPU and GPU code, both in Python and C++.
 The `.clang-format` is configured with LLVM base style, adjusted for wider screens, allowing 120 characters per line.
 
+**Multi-GPU Support**: The repository now includes multi-GPU implementations for both reduction and matrix multiplication operations, utilizing CUDA cooperative groups and efficient device partitioning strategies.
+
 ## Installation
 
 I'd recommend forking the repository for your own projects, but you can also clone it directly:
@@ -51,6 +53,35 @@ The project is designed to be as simple as possible, with the following workflow:
 2. Implement your baseline algorithm in `starter_kit_baseline.py`.
 3. Implement your optimized algorithm in `starter_kit.cu`.
 
+## Multi-GPU Features
+
+The starter kit now includes multi-GPU implementations:
+
+- **Multi-GPU Reduction**: Partitions data across available GPUs, performs parallel reductions, and aggregates results
+- **Multi-GPU Matrix Multiplication**: Distributes matrix rows across GPUs using row-wise partitioning with peer-to-peer access when available
+- **Automatic Detection**: Falls back to single-GPU or CPU when multiple GPUs are not available
+- **Cooperative Groups**: Uses CUDA cooperative groups for efficient inter-block synchronization
+- **Tested & Benchmarked**: Comprehensive test suite and performance benchmarks included
+
+Usage:
+```python
+import numpy as np
+from starter_kit import reduce_cuda_multigpu, matmul_cuda_multigpu, get_cuda_device_count
+
+# Check available GPUs
+num_gpus = get_cuda_device_count()
+print(f"Available GPUs: {num_gpus}")
+
+# Multi-GPU reduction
+data = np.random.rand(1_000_000).astype(np.float32)
+result = reduce_cuda_multigpu(data)
+
+# Multi-GPU matrix multiplication
+a = np.random.rand(1024, 1024).astype(np.float32)
+b = np.random.rand(1024, 1024).astype(np.float32)
+c = matmul_cuda_multigpu(a, b, tile_size=16)
+```
+
 ## Reading Materials
 
 Beginner GPGPU:
```
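
To make the new README section concrete, the row-wise split behind `matmul_cuda_multigpu` can be sketched in plain Python. This is an illustration of the partitioning idea only, not code from `starter_kit.cu`; `num_gpus` and `M` are placeholder values:

```python
# Illustrative only: dividing the rows of an M-row matrix across GPUs.
num_gpus, M = 4, 1024                          # hypothetical device count and row count
rows_per_gpu = (M + num_gpus - 1) // num_gpus  # ceil-divide so every row is covered
row_ranges = [
    (g * rows_per_gpu, min((g + 1) * rows_per_gpu, M))  # [start, end) rows for GPU g
    for g in range(num_gpus)
]
# Conceptually, GPU g computes C[start:end, :] = A[start:end, :] @ B,
# and the per-device slices are concatenated into the full product.
```

The usage snippet can also be followed by a quick sanity check against NumPy. The tolerances here are illustrative assumptions, not values from the project's test suite; float32 accumulation order differs across devices, so exact equality is not expected:

```python
# Continues the usage snippet above: `data`, `result`, `a`, `b`, `c` as defined there.
assert np.isclose(result, data.sum(), rtol=1e-3)
assert np.allclose(c, a @ b, rtol=1e-2, atol=1e-4)
```
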
bench.py: 14 additions & 1 deletion

```diff
@@ -15,7 +15,16 @@
 import pytest
 
 from starter_kit_baseline import reduce as reduce_baseline, matmul as matmul_baseline
-from starter_kit import reduce_openmp, reduce_cuda, matmul_openmp, matmul_cuda, supports_cuda
+from starter_kit import (
+    reduce_openmp,
+    reduce_cuda,
+    reduce_cuda_multigpu,
+    matmul_openmp,
+    matmul_cuda,
+    matmul_cuda_multigpu,
+    supports_cuda,
+    get_cuda_device_count,
+)
 
 # Build lists of (name, kernel_function) for reduction and matrix multiplication.
 REDUCTION_KERNELS = [
@@ -24,13 +33,17 @@
 ]
 if supports_cuda():
     REDUCTION_KERNELS.append(("cuda", reduce_cuda))
+if get_cuda_device_count() > 1:
+    REDUCTION_KERNELS.append(("cuda_multigpu", reduce_cuda_multigpu))
 
 MATMUL_KERNELS = [
     ("baseline", matmul_baseline),
     ("openmp", matmul_openmp),
 ]
 if supports_cuda():
     MATMUL_KERNELS.append(("cuda", matmul_cuda))
+if get_cuda_device_count() > 1:
+    MATMUL_KERNELS.append(("cuda_multigpu", matmul_cuda_multigpu))
 
 
 @pytest.mark.parametrize("dtype", [np.float32, np.int32])
```
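
The parametrized test bodies fall outside this diff. A minimal sketch of how a `(name, kernel)` list like `REDUCTION_KERNELS` is typically consumed with `pytest-benchmark` might look as follows; the test name, input size, and tolerance are assumptions, not code from `bench.py`:

```python
import numpy as np
import pytest

# Assumes REDUCTION_KERNELS is defined as in the module above.
@pytest.mark.parametrize("name,kernel", REDUCTION_KERNELS)
def test_reduce(benchmark, name, kernel):
    data = np.random.rand(1_000_000).astype(np.float32)
    result = benchmark(kernel, data)  # the `benchmark` fixture repeats and times the call
    assert np.isclose(result, data.sum(), rtol=1e-3)
```
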
pyproject.toml: 2 additions & 2 deletions

```diff
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=42", "wheel", "pybind11", "numpy"]
+requires = ["setuptools>=42", "wheel", "pybind11>=2.10,<2.13", "numpy"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -13,7 +13,7 @@ authors = [
     { name = "Ash Vardanian", email = "1983160+ashvardanian@users.noreply.github.com" },
 ]
 urls = { Homepage = "https://github.com/ashvardanian/PyBindToGPUs" }
-dependencies = ["pybind11", "numpy", "numba"]
+dependencies = ["pybind11>=2.10,<2.13", "numpy", "numba"]
 
 [project.optional-dependencies]
 cpu = ["pytest", "pytest-repeat", "pytest-benchmark"]
```
setup.py: 60 additions & 18 deletions

```diff
@@ -29,29 +29,33 @@ def build_extensions(self):
         super().build_extension(ext)
 
     def build_cuda_extension(self, ext):
-        # Compile CUDA source files
+        # Compile everything with NVCC (both device and host code)
+        cuda_objects = []
+        other_objects = []
+
         for source in ext.sources:
             if source.endswith(".cu"):
+                # Use NVCC to compile everything (device + host code)
                 self.compile_cuda(source)
-
-        # Compile non-CUDA source files
-        objects = []
-        for source in ext.sources:
-            if not source.endswith(".cu"):
+                cuda_objects.append(os.path.join(self.build_temp, "starter_kit.o"))
+            else:
+                # Compile non-CUDA files with GCC
                 obj = self.compiler.compile(
                     [source],
                     output_dir=self.build_temp,
                     include_dirs=ext.include_dirs,
                     extra_postargs=[
                         "-fPIC",
                         "-std=c++17",
                         "-fdiagnostics-color=always",
                     ],
                 )
-                objects.extend(obj)
+                other_objects.extend(obj)
 
+        # Link all object files
+        all_objects = cuda_objects + other_objects
         self.compiler.link_shared_object(
-            objects + [os.path.join(self.build_temp, "starter_kit.o")],
+            all_objects,
             self.get_ext_fullpath(ext.name),
             libraries=ext.libraries,
             library_dirs=ext.library_dirs,
@@ -103,33 +107,71 @@ def build_gcc_extension(self, ext):
         )
 
     def compile_cuda(self, source):
-        # Compile CUDA source file using NVCC
+        # Compile the CUDA source with NVCC (device and host code)
+        import subprocess
         ext = self.extensions[0]
         output_dir = self.build_temp
         os.makedirs(output_dir, exist_ok=True)
-        include_dirs = self.compiler.include_dirs + ext.include_dirs
-        include_dirs = " ".join(f"-I{dir}" for dir in include_dirs)
+
+        # Include all directories: CUDA headers, PyBind11, NumPy, Python, CCCL
+        # Filter to only existing directories
+        include_dirs = [d for d in ext.include_dirs if os.path.exists(d)]
+        print(f"\n{'='*70}")
+        print(f"Include Directories for NVCC:")
+        for d in include_dirs:
+            print(f" - {d}")
+        if not include_dirs:
+            print(" * WARNING: No include directories found!")
+        print(f"{'='*70}\n")
+
+        cuda_include_dirs_str = " ".join(f"-I{dir}" for dir in include_dirs)
         output_file = os.path.join(output_dir, "starter_kit.o")
 
         # Let's try inferring the compute capability from the GPU
-        arch_code = "90"
+        arch_code = "75"  # Default to Turing (T4 GPU)
         try:
             import pycuda.driver as cuda
             import pycuda.autoinit
 
             device = cuda.Device(0)  # Get the default device
             major, minor = device.compute_capability()
             arch_code = f"{major}{minor}"
-        except ImportError:
-            pass
+            print(f"Detected GPU Compute Capability: {arch_code}")
+        except Exception as e:
+            print(f"Could not detect GPU, using default arch {arch_code}: {e}")
 
+        # Compile both device and host code with nvcc (no -dc flag)
         cmd = (
             f"nvcc -c {source} -o {output_file} -std=c++17 "
             f"-gencode=arch=compute_{arch_code},code=sm_{arch_code} "
-            f"-Xcompiler -fPIC {include_dirs} -O3 -g"
+            f"--expt-relaxed-constexpr --expt-extended-lambda "
+            f"-D__CUDACC_RELAXED_CONSTEXPR__ "
+            f"-Xcompiler -fPIC,-Wno-psabi {cuda_include_dirs_str} -O3 -g"
         )
-        if os.system(cmd) != 0:
-            raise RuntimeError(f"nvcc compilation of {source} failed")
+
+        print(f"\n{'='*70}")
+        print(f"NVCC Command:")
+        print(f"{cmd}")
+        print(f"{'='*70}\n")
+
+        # Use subprocess to capture output
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+
+        if result.returncode != 0:
+            print(f"\n{'='*70}")
+            print(f"NVCC COMPILATION FAILED!")
+            print(f"{'='*70}")
+            print(f"STDOUT:\n{result.stdout}")
+            print(f"{'='*70}")
+            print(f"STDERR:\n{result.stderr}")
+            print(f"{'='*70}\n")
+            raise RuntimeError(f"nvcc compilation of {source} failed with exit code {result.returncode}")
+        else:
+            print(f"- NVCC compilation successful")
+            if result.stdout:
+                print(f"STDOUT: {result.stdout}")
+            if result.stderr:
+                print(f"STDERR: {result.stderr}")
 
 
 __version__ = open("VERSION", "r").read().strip()
@@ -165,7 +207,7 @@ def compile_cuda(self, source):
         ],
         #
         libraries=[python_lib_name.replace(".a", "")]
-        + (["cudart", "cuda", "cublas"] if enable_cuda else [])
+        + (["cudart", "cublas"] if enable_cuda else [])
         + (["gomp"] if enable_openmp else []),
         #
         extra_link_args=[f"-Wl,-rpath,{python_lib_dir}"]
```
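
A note on the compute-capability fallback in `compile_cuda`: when `pycuda` is not installed, the build silently uses the hard-coded `75`. One alternative is to query `nvidia-smi` directly. The sketch below is an option under assumptions, relying on a driver new enough to support the `compute_cap` query field; it is not part of this diff:

```python
import subprocess


def detect_arch_code(default: str = "75") -> str:
    """Best-effort compute-capability lookup, e.g. '75' for a Turing T4."""
    try:
        out = subprocess.run(
            ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"],
            capture_output=True, text=True, check=True,
        )
        first_gpu = out.stdout.strip().splitlines()[0]  # e.g. "7.5"
        return first_gpu.replace(".", "")               # "7.5" -> "75"
    except (FileNotFoundError, subprocess.CalledProcessError, IndexError):
        return default  # fall back to the same Turing default as setup.py
```
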