Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions examples/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,11 @@ macro(stdgpu_add_example_hip)
endmacro()

stdgpu_add_example_hip(atomic)
stdgpu_add_example_hip(bitset)
stdgpu_add_example_hip(deque)
stdgpu_add_example_hip(iterator)
stdgpu_add_example_hip(mutex_array)
stdgpu_add_example_hip(ranges)
stdgpu_add_example_hip(unordered_map)
stdgpu_add_example_hip(unordered_set)
stdgpu_add_example_hip(vector)
118 changes: 118 additions & 0 deletions examples/hip/bitset.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Copyright 2025 Patrick Stotko
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <iostream>
#include <thrust/copy.h>
#include <thrust/sequence.h>

#include <stdgpu/atomic.cuh> // stdgpu::atomic
#include <stdgpu/bitset.cuh> // stdgpu::bitset
#include <stdgpu/iterator.h> // device_begin, device_end
#include <stdgpu/memory.h> // createDeviceArray, destroyDeviceArray
#include <stdgpu/platform.h> // STDGPU_HOST_DEVICE

void
hip_check_error(hipError_t status, const char* file, int line)
{
if (status != hipSuccess)
{
std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl;
}
}

#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__)

struct is_odd
{
STDGPU_HOST_DEVICE bool
operator()(const int x) const
{
return x % 2 == 1;
}
};

__global__ void
set_bits(const int* d_result, const stdgpu::index_t d_result_size, stdgpu::bitset<> bits, stdgpu::atomic<int> counter)
{
stdgpu::index_t i = static_cast<stdgpu::index_t>(blockIdx.x * blockDim.x + threadIdx.x);

if (i >= d_result_size)
return;

bool was_set = bits.set(d_result[i]);

if (!was_set)
{
++counter;
}
}

int
main()
{
//
// EXAMPLE DESCRIPTION
// -------------------
// This example shows how every second bit of stdgpu::bitset can be set concurrently in a GPU kernel.
//

const stdgpu::index_t n = 100;

int* d_input = createDeviceArray<int>(n);
int* d_result = createDeviceArray<int>(n / 2);
stdgpu::bitset<> bits = stdgpu::bitset<>::createDeviceObject(n);
stdgpu::atomic<int> counter = stdgpu::atomic<int>::createDeviceObject();

thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1);

// d_input : 1, 2, 3, ..., 100

thrust::copy_if(stdgpu::device_cbegin(d_input),
stdgpu::device_cend(d_input),
stdgpu::device_begin(d_result),
is_odd());

// d_result : 1, 3, 5, ..., 99

// bits : 000000..00

stdgpu::index_t threads = 32;
stdgpu::index_t blocks = ((n / 2) + threads - 1) / threads;

counter.store(0);

set_bits<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_result, n / 2, bits, counter);
HIP_CHECK(hipDeviceSynchronize());

// bits : 010101...01

std::cout << "First run: The number of set bits is " << bits.count() << " (" << n / 2 << " expected; "
<< counter.load() << " of those previously unset)" << std::endl;

counter.store(0);

set_bits<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_result, n / 2, bits, counter);
HIP_CHECK(hipDeviceSynchronize());

// bits : 010101...01

std::cout << "Second run: The number of set bits is " << bits.count() << " (" << n / 2 << " expected; "
<< counter.load() << " of those previously unset)" << std::endl;

destroyDeviceArray<int>(d_input);
destroyDeviceArray<int>(d_result);
stdgpu::bitset<>::destroyDeviceObject(bits);
stdgpu::atomic<int>::destroyDeviceObject(counter);
}
110 changes: 110 additions & 0 deletions examples/hip/deque.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/*
* Copyright 2025 Patrick Stotko
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <iostream>
#include <thrust/copy.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>

#include <stdgpu/deque.cuh> // stdgpu::deque
#include <stdgpu/iterator.h> // device_begin, device_end
#include <stdgpu/memory.h> // createDeviceArray, destroyDeviceArray
#include <stdgpu/platform.h> // STDGPU_HOST_DEVICE

void
hip_check_error(hipError_t status, const char* file, int line)
{
if (status != hipSuccess)
{
std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl;
}
}

#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__)

struct is_odd
{
STDGPU_HOST_DEVICE bool
operator()(const int x) const
{
return x % 2 == 1;
}
};

__global__ void
insert_neighbors_with_duplicates(const int* d_input, const stdgpu::index_t n, stdgpu::deque<int> deq)
{
stdgpu::index_t i = static_cast<stdgpu::index_t>(blockIdx.x * blockDim.x + threadIdx.x);

if (i >= n)
return;

int num = d_input[i];
int num_neighborhood[3] = { num - 1, num, num + 1 };

is_odd odd;
for (int num_neighbor : num_neighborhood)
{
if (odd(num_neighbor))
{
deq.push_back(num_neighbor);
}
else
{
deq.push_front(num_neighbor);
}
}
}

int
main()
{
//
// EXAMPLE DESCRIPTION
// -------------------
// This example demonstrates how stdgpu::deque is used to compute a set of duplicated numbers.
// Every number is contained 3 times, except for the first and last one.
// Furthermore, even numbers are put into the front, whereas odd number are put into the back.
//

const stdgpu::index_t n = 100;

int* d_input = createDeviceArray<int>(n);
stdgpu::deque<int> deq = stdgpu::deque<int>::createDeviceObject(3 * n);

thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1);

// d_input : 1, 2, 3, ..., 100

stdgpu::index_t threads = 32;
stdgpu::index_t blocks = (n + threads - 1) / threads;
insert_neighbors_with_duplicates<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_input,
n,
deq);
HIP_CHECK(hipDeviceSynchronize());

// deq : 0, 1, 1, 2, 2, 2, 3, 3, 3, ..., 99, 99, 99, 100, 100, 101

auto range_deq = deq.device_range();
int sum = thrust::reduce(range_deq.begin(), range_deq.end(), 0, thrust::plus<int>());

const int sum_closed_form = 3 * (n * (n + 1) / 2);

std::cout << "The set of duplicated numbers contains " << deq.size() << " elements (" << 3 * n
<< " expected) and the computed sum is " << sum << " (" << sum_closed_form << " expected)" << std::endl;

destroyDeviceArray<int>(d_input);
stdgpu::deque<int>::destroyDeviceObject(deq);
}
118 changes: 118 additions & 0 deletions examples/hip/mutex_array.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Copyright 2025 Patrick Stotko
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <iostream>
#include <thrust/reduce.h>
#include <thrust/sequence.h>

#include <stdgpu/atomic.cuh> // stdgpu::atomic
#include <stdgpu/iterator.h> // device_begin, device_end
#include <stdgpu/memory.h> // createDeviceArray, destroyDeviceArray
#include <stdgpu/mutex.cuh> // stdgpu::mutex_array
#include <stdgpu/platform.h> // STDGPU_HOST_DEVICE
#include <stdgpu/vector.cuh> // stdgpu::vector

void
hip_check_error(hipError_t status, const char* file, int line)
{
if (status != hipSuccess)
{
std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl;
}
}

#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__)

struct is_odd
{
STDGPU_HOST_DEVICE bool
operator()(const int x) const
{
return x % 2 == 1;
}
};

__global__ void
try_partial_sum(const int* d_input, const stdgpu::index_t n, stdgpu::mutex_array<> locks, int* d_result)
{
stdgpu::index_t i = static_cast<stdgpu::index_t>(blockIdx.x * blockDim.x + threadIdx.x);

if (i >= n)
return;

stdgpu::index_t j = i % locks.size();

// While loops might hang due to internal driver scheduling, so use a fixed number of trials.
// Do not loop over try_lock(). Instead, loop over the whole sequential part to avoid deadlocks.
bool finished = false;
const stdgpu::index_t number_trials = 5;
for (stdgpu::index_t k = 0; k < number_trials; ++k)
{
// --- SEQUENTIAL PART ---
if (!finished && locks[j].try_lock())
{
// START --- critical section --- START

d_result[j] += d_input[i];

// END --- critical section --- END
locks[j].unlock();
finished = true;
}
// --- SEQUENTIAL PART ---
}
}

int
main()
{
//
// EXAMPLE DESCRIPTION
// -------------------
// This example demonstrates how stdgpu::mutex_array can be used to implement spin locks on the GPU.
// Since the correct usage still comes with many implications, this example is oversimplified and just shows the
// deadlock-free looping.
//

const stdgpu::index_t n = 100;
const stdgpu::index_t m = 10;

int* d_input = createDeviceArray<int>(n);
int* d_result = createDeviceArray<int>(m);
stdgpu::mutex_array<> locks = stdgpu::mutex_array<>::createDeviceObject(m);

thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1);

// d_input : 1, 2, 3, ..., 100

stdgpu::index_t threads = 32;
stdgpu::index_t blocks = (n + threads - 1) / threads;
try_partial_sum<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_input,
n,
locks,
d_result);
HIP_CHECK(hipDeviceSynchronize());

int sum = thrust::reduce(stdgpu::device_cbegin(d_result), stdgpu::device_cend(d_result), 0, thrust::plus<int>());

const int sum_closed_form = n * (n + 1) / 2;

std::cout << "The sum of all partially computed sums (via mutex locks) is " << sum
<< " which intentionally might not match the expected value of " << sum_closed_form << std::endl;

destroyDeviceArray<int>(d_input);
destroyDeviceArray<int>(d_result);
stdgpu::mutex_array<>::destroyDeviceObject(locks);
}
Loading