Skip to content

Commit cddfd42

Browse files
committed
Fixed the issues of including Thrust and MIOpen
1 parent 4bdf569 commit cddfd42

File tree

13 files changed

+129
-69
lines changed

13 files changed

+129
-69
lines changed

docker/rocm/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# GPU Programming 101 - ROCm Development Container
22
# Based on AMD's official ROCm development image - used as-is for maximum compatibility
33

4-
FROM rocm/dev-ubuntu-24.04:latest
4+
FROM rocm/dev-ubuntu-24.04:7.0-complete
55

66
# Metadata
77
LABEL maintainer="GPU Programming 101"

modules/module1/examples/03_matrix_multiplication_hip.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,8 @@ __global__ void matrixMultiplyAMDOptimized(float *A, float *B, float *C, int N)
8080

8181
float sum = 0.0f;
8282

83-
// Unrolled tile loop for better instruction scheduling
84-
#pragma unroll 4
83+
// ROCm 7: No fixed unroll count (previously "#pragma unroll 4"); no explicit loop hint is applied here
84+
// The compiler will determine optimal unrolling based on target architecture
8585
for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
8686
// Coalesced loads
8787
tileA[hipThreadIdx_y][hipThreadIdx_x] =

modules/module1/examples/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ HIP_DEBUG_FLAGS = -std=c++17 -g
3333

3434
# ROCm 7 note: hipcc may require explicit --rocm-path when HIP runtime/version files
3535
# aren't in legacy locations. See ROCm docs on file structure reorg.
36-
# Prefer environment ROCM_PATH if set; fall back to /opt/rocm.
37-
ROCM_PATH ?= /opt/rocm
36+
# Use environment ROCM_PATH if set; otherwise prefer /opt/rocm-7.0.0, then fall back to the first /opt/rocm* installation found.
37+
ROCM_PATH ?= $(shell ls -d /opt/rocm-7.0.0 2>/dev/null || ls -d /opt/rocm* 2>/dev/null | head -1 || echo /opt/rocm)
3838
# Prefer hipconfig (ROCm 7) to locate ROCm root, then fall back to hipcc path.
3939
HIPCONFIG_BIN := $(shell command -v hipconfig 2>/dev/null)
4040
ifneq ($(HIPCONFIG_BIN),)

modules/module2/examples/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ HIP_FLAGS = -std=c++17 -O2
3232
HIP_DEBUG_FLAGS = -std=c++17 -g
3333

3434
# ROCm 7: Ensure hipcc can find HIP runtime by passing --rocm-path
35-
ROCM_PATH ?= /opt/rocm
35+
ROCM_PATH ?= $(shell ls -d /opt/rocm-7.0.0 2>/dev/null || ls -d /opt/rocm* 2>/dev/null | head -1 || echo /opt/rocm)
3636
# Auto-detect ROCm path from hipcc if headers not found
3737
ifeq ($(wildcard $(ROCM_PATH)/include/hip/hip_runtime.h),)
3838
HIPCC_BIN := $(shell command -v hipcc 2>/dev/null)

modules/module3/examples/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ HIP_FLAGS = -std=c++17 -O2
3232
HIP_DEBUG_FLAGS = -std=c++17 -g
3333

3434
# ROCm 7: Ensure hipcc can find HIP runtime by passing --rocm-path
35-
ROCM_PATH ?= /opt/rocm
35+
ROCM_PATH ?= $(shell ls -d /opt/rocm-7.0.0 2>/dev/null || ls -d /opt/rocm* 2>/dev/null | head -1 || echo /opt/rocm)
3636
# Auto-detect ROCm path from hipcc if headers not found
3737
ifeq ($(wildcard $(ROCM_PATH)/include/hip/hip_runtime.h),)
3838
HIPCC_BIN := $(shell command -v hipcc 2>/dev/null)

modules/module4/examples/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ HIP_FLAGS = -std=c++17 -O2 -fopenmp
3333
HIP_DEBUG_FLAGS = -std=c++17 -g -fopenmp
3434

3535
# ROCm 7: Ensure hipcc can find HIP runtime by passing --rocm-path
36-
ROCM_PATH ?= /opt/rocm
36+
ROCM_PATH ?= $(shell ls -d /opt/rocm-7.0.0 2>/dev/null || ls -d /opt/rocm* 2>/dev/null | head -1 || echo /opt/rocm)
3737
# Auto-detect ROCm path from hipcc if headers not found
3838
ifeq ($(wildcard $(ROCM_PATH)/include/hip/hip_runtime.h),)
3939
HIPCC_BIN := $(shell command -v hipcc 2>/dev/null)

modules/module5/examples/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ HIP_FLAGS = -std=c++17 -O3
3232
HIP_DEBUG_FLAGS = -std=c++17 -g
3333

3434
# ROCm 7: Ensure hipcc can find HIP runtime by passing --rocm-path
35-
ROCM_PATH ?= /opt/rocm
35+
ROCM_PATH ?= $(shell ls -d /opt/rocm-7.0.0 2>/dev/null || ls -d /opt/rocm* 2>/dev/null | head -1 || echo /opt/rocm)
3636
# Auto-detect ROCm path from hipcc if headers not found
3737
ifeq ($(wildcard $(ROCM_PATH)/include/hip/hip_runtime.h),)
3838
HIPCC_BIN := $(shell command -v hipcc 2>/dev/null)

modules/module6/examples/01_convolution_hip.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ __device__ float wavefront_reduce_sum(float val) {
296296
void initialize_data(float *data, int size, bool random = true) {
297297
for (int i = 0; i < size; i++) {
298298
if (random) {
299-
data[i] = static_cast<float>(rand()) / RAND_MAX;
299+
data[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
300300
} else {
301301
data[i] = 1.0f; // Unit impulse for testing
302302
}

modules/module6/examples/03_histogram_hip.cpp

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -96,18 +96,10 @@ __global__ void histogram_wavefront_aggregation(int* input, int* histogram, int
9696
for (int i = idx; i < n; i += blockDim.x * gridDim.x) {
9797
int bin = input[i] % num_bins;
9898

99-
// Use ballot and popcount for wavefront aggregation
100-
uint64_t mask = __ballot(1); // All active threads in wavefront
101-
int count = __popcll(mask);
102-
103-
// Count how many threads in wavefront want same bin
104-
uint64_t same_bin_mask = __ballot(bin == bin); // Simplified - would need proper comparison
105-
int same_bin_count = __popcll(same_bin_mask);
106-
107-
// Only first thread with this bin value updates
108-
if (__ffsll((unsigned long long)same_bin_mask) - 1 == lane) {
109-
atomicAdd(&lds_hist[bin], same_bin_count);
110-
}
99+
// Simple atomic increment - removing complex wavefront aggregation for clarity
100+
// In a production implementation, you would use more sophisticated wavefront
101+
// aggregation by comparing bin values across the wavefront
102+
atomicAdd(&lds_hist[bin], 1);
111103
}
112104

113105
__syncthreads();

modules/module6/examples/05_prefix_sum_hip.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ __global__ void blelloch_scan_exclusive(float* input, float* output, int n) {
182182
// LDS bank conflict free optimization for AMD GPUs
183183
#define NUM_BANKS 32
184184
#define LOG_NUM_BANKS 5
185-
#define CONFLICT_FREE_OFFSET(n) (((n) >> NUM_BANKS) + ((n) >> (2 * NUM_BANKS)))
185+
#define CONFLICT_FREE_OFFSET(n) (((n) >> LOG_NUM_BANKS) + ((n) >> (2 * LOG_NUM_BANKS)))
186186

187187
__global__ void blelloch_scan_lds_optimized(float* input, float* output, int n) {
188188
__shared__ float temp[512 + 512/NUM_BANKS]; // Extra space for conflict avoidance

0 commit comments

Comments
 (0)