From 740e7668dbc5439cc56f4fe67b2ed14ac18025a2 Mon Sep 17 00:00:00 2001 From: Patrick Stotko Date: Sun, 14 Dec 2025 16:07:29 +0100 Subject: [PATCH] examples: Add more device-specific examples to HIP backend --- examples/hip/CMakeLists.txt | 6 ++ examples/hip/bitset.hip | 118 ++++++++++++++++++++++++++++++ examples/hip/deque.hip | 110 ++++++++++++++++++++++++++++ examples/hip/mutex_array.hip | 118 ++++++++++++++++++++++++++++++ examples/hip/unordered_map.hip | 128 +++++++++++++++++++++++++++++++++ examples/hip/unordered_set.hip | 107 +++++++++++++++++++++++++++ examples/hip/vector.hip | 92 ++++++++++++++++++++++++ 7 files changed, 679 insertions(+) create mode 100644 examples/hip/bitset.hip create mode 100644 examples/hip/deque.hip create mode 100644 examples/hip/mutex_array.hip create mode 100644 examples/hip/unordered_map.hip create mode 100644 examples/hip/unordered_set.hip create mode 100644 examples/hip/vector.hip diff --git a/examples/hip/CMakeLists.txt b/examples/hip/CMakeLists.txt index 60a4ad81c..77d9d3de1 100644 --- a/examples/hip/CMakeLists.txt +++ b/examples/hip/CMakeLists.txt @@ -4,5 +4,11 @@ macro(stdgpu_add_example_hip) endmacro() stdgpu_add_example_hip(atomic) +stdgpu_add_example_hip(bitset) +stdgpu_add_example_hip(deque) stdgpu_add_example_hip(iterator) +stdgpu_add_example_hip(mutex_array) stdgpu_add_example_hip(ranges) +stdgpu_add_example_hip(unordered_map) +stdgpu_add_example_hip(unordered_set) +stdgpu_add_example_hip(vector) diff --git a/examples/hip/bitset.hip b/examples/hip/bitset.hip new file mode 100644 index 000000000..309190d58 --- /dev/null +++ b/examples/hip/bitset.hip @@ -0,0 +1,118 @@ +/* + * Copyright 2025 Patrick Stotko + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include // stdgpu::atomic +#include // stdgpu::bitset +#include // device_begin, device_end +#include // createDeviceArray, destroyDeviceArray +#include // STDGPU_HOST_DEVICE + +void +hip_check_error(hipError_t status, const char* file, int line) +{ + if (status != hipSuccess) + { + std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl; + } +} + +#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__) + +struct is_odd +{ + STDGPU_HOST_DEVICE bool + operator()(const int x) const + { + return x % 2 == 1; + } +}; + +__global__ void +set_bits(const int* d_result, const stdgpu::index_t d_result_size, stdgpu::bitset<> bits, stdgpu::atomic counter) +{ + stdgpu::index_t i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + + if (i >= d_result_size) + return; + + bool was_set = bits.set(d_result[i]); + + if (!was_set) + { + ++counter; + } +} + +int +main() +{ + // + // EXAMPLE DESCRIPTION + // ------------------- + // This example shows how every second bit of stdgpu::bitset can be set concurrently in a GPU kernel. + // + + const stdgpu::index_t n = 100; + + int* d_input = createDeviceArray(n); + int* d_result = createDeviceArray(n / 2); + stdgpu::bitset<> bits = stdgpu::bitset<>::createDeviceObject(n); + stdgpu::atomic counter = stdgpu::atomic::createDeviceObject(); + + thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1); + + // d_input : 1, 2, 3, ..., 100 + + thrust::copy_if(stdgpu::device_cbegin(d_input), + stdgpu::device_cend(d_input), + stdgpu::device_begin(d_result), + is_odd()); + + // d_result : 1, 3, 5, ..., 99 + + // bits : 000000..00 + + stdgpu::index_t threads = 32; + stdgpu::index_t blocks = ((n / 2) + threads - 1) / threads; + + counter.store(0); + + set_bits<<(blocks), static_cast(threads)>>>(d_result, n / 2, bits, counter); + HIP_CHECK(hipDeviceSynchronize()); + + // bits : 010101...01 + + std::cout << "First run: The number of set bits is " << bits.count() << " (" << n / 2 << " expected; " + << counter.load() << " of those previously unset)" << std::endl; + + counter.store(0); + + set_bits<<(blocks), static_cast(threads)>>>(d_result, n / 2, bits, counter); + HIP_CHECK(hipDeviceSynchronize()); + + // bits : 010101...01 + + std::cout << "Second run: The number of set bits is " << bits.count() << " (" << n / 2 << " expected; " + << counter.load() << " of those previously unset)" << std::endl; + + destroyDeviceArray(d_input); + destroyDeviceArray(d_result); + stdgpu::bitset<>::destroyDeviceObject(bits); + stdgpu::atomic::destroyDeviceObject(counter); +} diff --git a/examples/hip/deque.hip b/examples/hip/deque.hip new file mode 100644 index 000000000..b92d4e62c --- /dev/null +++ b/examples/hip/deque.hip @@ -0,0 +1,110 @@ +/* + * Copyright 2025 Patrick Stotko + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include // stdgpu::deque +#include // device_begin, device_end +#include // createDeviceArray, destroyDeviceArray +#include // STDGPU_HOST_DEVICE + +void +hip_check_error(hipError_t status, const char* file, int line) +{ + if (status != hipSuccess) + { + std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl; + } +} + +#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__) + +struct is_odd +{ + STDGPU_HOST_DEVICE bool + operator()(const int x) const + { + return x % 2 == 1; + } +}; + +__global__ void +insert_neighbors_with_duplicates(const int* d_input, const stdgpu::index_t n, stdgpu::deque deq) +{ + stdgpu::index_t i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + + if (i >= n) + return; + + int num = d_input[i]; + int num_neighborhood[3] = { num - 1, num, num + 1 }; + + is_odd odd; + for (int num_neighbor : num_neighborhood) + { + if (odd(num_neighbor)) + { + deq.push_back(num_neighbor); + } + else + { + deq.push_front(num_neighbor); + } + } +} + +int +main() +{ + // + // EXAMPLE DESCRIPTION + // ------------------- + // This example demonstrates how stdgpu::deque is used to compute a set of duplicated numbers. + // Every number is contained 3 times, except for the first and last one. + // Furthermore, even numbers are put into the front, whereas odd number are put into the back. + // + + const stdgpu::index_t n = 100; + + int* d_input = createDeviceArray(n); + stdgpu::deque deq = stdgpu::deque::createDeviceObject(3 * n); + + thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1); + + // d_input : 1, 2, 3, ..., 100 + + stdgpu::index_t threads = 32; + stdgpu::index_t blocks = (n + threads - 1) / threads; + insert_neighbors_with_duplicates<<(blocks), static_cast(threads)>>>(d_input, + n, + deq); + HIP_CHECK(hipDeviceSynchronize()); + + // deq : 0, 1, 1, 2, 2, 2, 3, 3, 3, ..., 99, 99, 99, 100, 100, 101 + + auto range_deq = deq.device_range(); + int sum = thrust::reduce(range_deq.begin(), range_deq.end(), 0, thrust::plus()); + + const int sum_closed_form = 3 * (n * (n + 1) / 2); + + std::cout << "The set of duplicated numbers contains " << deq.size() << " elements (" << 3 * n + << " expected) and the computed sum is " << sum << " (" << sum_closed_form << " expected)" << std::endl; + + destroyDeviceArray(d_input); + stdgpu::deque::destroyDeviceObject(deq); +} diff --git a/examples/hip/mutex_array.hip b/examples/hip/mutex_array.hip new file mode 100644 index 000000000..fadf331c3 --- /dev/null +++ b/examples/hip/mutex_array.hip @@ -0,0 +1,118 @@ +/* + * Copyright 2025 Patrick Stotko + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include // stdgpu::atomic +#include // device_begin, device_end +#include // createDeviceArray, destroyDeviceArray +#include // stdgpu::mutex_array +#include // STDGPU_HOST_DEVICE +#include // stdgpu::vector + +void +hip_check_error(hipError_t status, const char* file, int line) +{ + if (status != hipSuccess) + { + std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl; + } +} + +#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__) + +struct is_odd +{ + STDGPU_HOST_DEVICE bool + operator()(const int x) const + { + return x % 2 == 1; + } +}; + +__global__ void +try_partial_sum(const int* d_input, const stdgpu::index_t n, stdgpu::mutex_array<> locks, int* d_result) +{ + stdgpu::index_t i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + + if (i >= n) + return; + + stdgpu::index_t j = i % locks.size(); + + // While loops might hang due to internal driver scheduling, so use a fixed number of trials. + // Do not loop over try_lock(). Instead, loop over the whole sequential part to avoid deadlocks. + bool finished = false; + const stdgpu::index_t number_trials = 5; + for (stdgpu::index_t k = 0; k < number_trials; ++k) + { + // --- SEQUENTIAL PART --- + if (!finished && locks[j].try_lock()) + { + // START --- critical section --- START + + d_result[j] += d_input[i]; + + // END --- critical section --- END + locks[j].unlock(); + finished = true; + } + // --- SEQUENTIAL PART --- + } +} + +int +main() +{ + // + // EXAMPLE DESCRIPTION + // ------------------- + // This example demonstrates how stdgpu::mutex_array can be used to implement spin locks on the GPU. + // Since the correct usage still comes with many implications, this example is oversimplified and just shows the + // deadlock-free looping. + // + + const stdgpu::index_t n = 100; + const stdgpu::index_t m = 10; + + int* d_input = createDeviceArray(n); + int* d_result = createDeviceArray(m); + stdgpu::mutex_array<> locks = stdgpu::mutex_array<>::createDeviceObject(m); + + thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1); + + // d_input : 1, 2, 3, ..., 100 + + stdgpu::index_t threads = 32; + stdgpu::index_t blocks = (n + threads - 1) / threads; + try_partial_sum<<(blocks), static_cast(threads)>>>(d_input, + n, + locks, + d_result); + HIP_CHECK(hipDeviceSynchronize()); + + int sum = thrust::reduce(stdgpu::device_cbegin(d_result), stdgpu::device_cend(d_result), 0, thrust::plus()); + + const int sum_closed_form = n * (n + 1) / 2; + + std::cout << "The sum of all partially computed sums (via mutex locks) is " << sum + << " which intentionally might not match the expected value of " << sum_closed_form << std::endl; + + destroyDeviceArray(d_input); + destroyDeviceArray(d_result); + stdgpu::mutex_array<>::destroyDeviceObject(locks); +} diff --git a/examples/hip/unordered_map.hip b/examples/hip/unordered_map.hip new file mode 100644 index 000000000..943300fb9 --- /dev/null +++ b/examples/hip/unordered_map.hip @@ -0,0 +1,128 @@ +/* + * Copyright 2025 Patrick Stotko + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include // device_begin, device_end +#include // createDeviceArray, destroyDeviceArray +#include // STDGPU_HOST_DEVICE +#include // stdgpu::unordered_map + +void +hip_check_error(hipError_t status, const char* file, int line) +{ + if (status != hipSuccess) + { + std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl; + } +} + +#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__) + +struct is_odd +{ + STDGPU_HOST_DEVICE bool + operator()(const int x) const + { + return x % 2 == 1; + } +}; + +struct square +{ + STDGPU_HOST_DEVICE int + operator()(const int x) const + { + return x * x; + } +}; + +struct int_pair_plus +{ + STDGPU_HOST_DEVICE stdgpu::pair + operator()(const stdgpu::pair& lhs, const stdgpu::pair& rhs) const + { + return { lhs.first + rhs.first, lhs.second + rhs.second }; + } +}; + +__global__ void +insert_neighbors(const int* d_result, const stdgpu::index_t n, stdgpu::unordered_map map) +{ + stdgpu::index_t i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + + if (i >= n) + return; + + int num = d_result[i]; + int num_neighborhood[3] = { num - 1, num, num + 1 }; + + for (int num_neighbor : num_neighborhood) + { + map.emplace(num_neighbor, square()(num_neighbor)); + } +} + +int +main() +{ + // + // EXAMPLE DESCRIPTION + // ------------------- + // This example demonstrates how stdgpu::unordered_map is used to compute a duplicate-free set of numbers. + // + + const stdgpu::index_t n = 100; + + int* d_input = createDeviceArray(n); + int* d_result = createDeviceArray(n / 2); + stdgpu::unordered_map map = stdgpu::unordered_map::createDeviceObject(n); + + thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1); + + // d_input : 1, 2, 3, ..., 100 + + thrust::copy_if(stdgpu::device_cbegin(d_input), + stdgpu::device_cend(d_input), + stdgpu::device_begin(d_result), + is_odd()); + + // d_result : 1, 3, 5, ..., 99 + + stdgpu::index_t threads = 32; + stdgpu::index_t blocks = (n / 2 + threads - 1) / threads; + insert_neighbors<<(blocks), static_cast(threads)>>>(d_result, n / 2, map); + HIP_CHECK(hipDeviceSynchronize()); + + // map : 0, 1, 2, 3, ..., 100 + + auto range_map = map.device_range(); + stdgpu::pair sum = + thrust::reduce(range_map.begin(), range_map.end(), stdgpu::pair(0, 0), int_pair_plus()); + + const stdgpu::pair sum_closed_form = { static_cast(n * (n + 1) / 2), + static_cast(n * (n + 1) * (2 * n + 1) / 6) }; + + std::cout << "The duplicate-free map of numbers contains " << map.size() << " elements (" << n + 1 + << " expected) and the computed sums are (" << sum.first << ", " << sum.second << ") ((" + << sum_closed_form.first << ", " << sum_closed_form.second << ") expected)" << std::endl; + + destroyDeviceArray(d_input); + destroyDeviceArray(d_result); + stdgpu::unordered_map::destroyDeviceObject(map); +} diff --git a/examples/hip/unordered_set.hip b/examples/hip/unordered_set.hip new file mode 100644 index 000000000..7ed25789c --- /dev/null +++ b/examples/hip/unordered_set.hip @@ -0,0 +1,107 @@ +/* + * Copyright 2025 Patrick Stotko + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include // device_begin, device_end +#include // createDeviceArray, destroyDeviceArray +#include // STDGPU_HOST_DEVICE +#include // stdgpu::unordered_set + +void +hip_check_error(hipError_t status, const char* file, int line) +{ + if (status != hipSuccess) + { + std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl; + } +} + +#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__) + +struct is_odd +{ + STDGPU_HOST_DEVICE bool + operator()(const int x) const + { + return x % 2 == 1; + } +}; + +__global__ void +insert_neighbors(const int* d_result, const stdgpu::index_t n, stdgpu::unordered_set set) +{ + stdgpu::index_t i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + + if (i >= n) + return; + + int num = d_result[i]; + int num_neighborhood[3] = { num - 1, num, num + 1 }; + + for (int num_neighbor : num_neighborhood) + { + set.insert(num_neighbor); + } +} + +int +main() +{ + // + // EXAMPLE DESCRIPTION + // ------------------- + // This example demonstrates how stdgpu::unordered_set is used to compute a duplicate-free set of numbers. + // + + const stdgpu::index_t n = 100; + + int* d_input = createDeviceArray(n); + int* d_result = createDeviceArray(n / 2); + stdgpu::unordered_set set = stdgpu::unordered_set::createDeviceObject(n); + + thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1); + + // d_input : 1, 2, 3, ..., 100 + + thrust::copy_if(stdgpu::device_cbegin(d_input), + stdgpu::device_cend(d_input), + stdgpu::device_begin(d_result), + is_odd()); + + // d_result : 1, 3, 5, ..., 99 + + stdgpu::index_t threads = 32; + stdgpu::index_t blocks = (n / 2 + threads - 1) / threads; + insert_neighbors<<(blocks), static_cast(threads)>>>(d_result, n / 2, set); + HIP_CHECK(hipDeviceSynchronize()); + + // set : 0, 1, 2, 3, ..., 100 + + auto range_set = set.device_range(); + int sum = thrust::reduce(range_set.begin(), range_set.end(), 0, thrust::plus()); + + const int sum_closed_form = n * (n + 1) / 2; + + std::cout << "The duplicate-free set of numbers contains " << set.size() << " elements (" << n + 1 + << " expected) and the computed sum is " << sum << " (" << sum_closed_form << " expected)" << std::endl; + + destroyDeviceArray(d_input); + destroyDeviceArray(d_result); + stdgpu::unordered_set::destroyDeviceObject(set); +} diff --git a/examples/hip/vector.hip b/examples/hip/vector.hip new file mode 100644 index 000000000..42d1272db --- /dev/null +++ b/examples/hip/vector.hip @@ -0,0 +1,92 @@ +/* + * Copyright 2025 Patrick Stotko + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include // device_begin, device_end +#include // createDeviceArray, destroyDeviceArray +#include // STDGPU_HOST_DEVICE +#include // stdgpu::vector + +void +hip_check_error(hipError_t status, const char* file, int line) +{ + if (status != hipSuccess) + { + std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl; + } +} + +#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__) + +__global__ void +insert_neighbors_with_duplicates(const int* d_input, const stdgpu::index_t n, stdgpu::vector vec) +{ + stdgpu::index_t i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + + if (i >= n) + return; + + int num = d_input[i]; + int num_neighborhood[3] = { num - 1, num, num + 1 }; + + for (int num_neighbor : num_neighborhood) + { + vec.push_back(num_neighbor); + } +} + +int +main() +{ + // + // EXAMPLE DESCRIPTION + // ------------------- + // This example demonstrates how stdgpu::vector is used to compute a set of duplicated numbers. + // Every number is contained 3 times, except for the first and last one which is contained only 2 times. + // + + const stdgpu::index_t n = 100; + + int* d_input = createDeviceArray(n); + stdgpu::vector vec = stdgpu::vector::createDeviceObject(3 * n); + + thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1); + + // d_input : 1, 2, 3, ..., 100 + + stdgpu::index_t threads = 32; + stdgpu::index_t blocks = (n + threads - 1) / threads; + insert_neighbors_with_duplicates<<(blocks), static_cast(threads)>>>(d_input, + n, + vec); + HIP_CHECK(hipDeviceSynchronize()); + + // vec : 0, 1, 1, 2, 2, 2, 3, 3, 3, ..., 99, 99, 99, 100, 100, 101 + + auto range_vec = vec.device_range(); + int sum = thrust::reduce(range_vec.begin(), range_vec.end(), 0, thrust::plus()); + + const int sum_closed_form = 3 * (n * (n + 1) / 2); + + std::cout << "The set of duplicated numbers contains " << vec.size() << " elements (" << 3 * n + << " expected) and the computed sum is " << sum << " (" << sum_closed_form << " expected)" << std::endl; + + destroyDeviceArray(d_input); + stdgpu::vector::destroyDeviceObject(vec); +}