From 740e7668dbc5439cc56f4fe67b2ed14ac18025a2 Mon Sep 17 00:00:00 2001
From: Patrick Stotko <stotko@cs.uni-bonn.de>
Date: Sun, 14 Dec 2025 16:07:29 +0100
Subject: [PATCH] examples: Add more device-specific examples to HIP backend

---
 examples/hip/CMakeLists.txt    |   6 ++
 examples/hip/bitset.hip        | 118 ++++++++++++++++++++++++++++++
 examples/hip/deque.hip         | 110 ++++++++++++++++++++++++++++
 examples/hip/mutex_array.hip   | 118 ++++++++++++++++++++++++++++++
 examples/hip/unordered_map.hip | 128 +++++++++++++++++++++++++++++++++
 examples/hip/unordered_set.hip | 107 +++++++++++++++++++++++++++
 examples/hip/vector.hip        |  92 ++++++++++++++++++++++++
 7 files changed, 679 insertions(+)
 create mode 100644 examples/hip/bitset.hip
 create mode 100644 examples/hip/deque.hip
 create mode 100644 examples/hip/mutex_array.hip
 create mode 100644 examples/hip/unordered_map.hip
 create mode 100644 examples/hip/unordered_set.hip
 create mode 100644 examples/hip/vector.hip
diff --git a/examples/hip/CMakeLists.txt b/examples/hip/CMakeLists.txt
index 60a4ad81c..77d9d3de1 100644
--- a/examples/hip/CMakeLists.txt
+++ b/examples/hip/CMakeLists.txt
@@ -4,5 +4,11 @@ macro(stdgpu_add_example_hip)
 endmacro()
 
 stdgpu_add_example_hip(atomic)
+stdgpu_add_example_hip(bitset)
+stdgpu_add_example_hip(deque)
 stdgpu_add_example_hip(iterator)
+stdgpu_add_example_hip(mutex_array)
 stdgpu_add_example_hip(ranges)
+stdgpu_add_example_hip(unordered_map)
+stdgpu_add_example_hip(unordered_set)
+stdgpu_add_example_hip(vector)
diff --git a/examples/hip/bitset.hip b/examples/hip/bitset.hip
new file mode 100644
index 000000000..309190d58
--- /dev/null
+++ b/examples/hip/bitset.hip
@@ -0,0 +1,118 @@
+/*
+ *  Copyright 2025 Patrick Stotko
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <iostream>
+#include <thrust/copy.h>
+#include <thrust/sequence.h>
+
+#include <stdgpu/atomic.cuh> // stdgpu::atomic
+#include <stdgpu/bitset.cuh> // stdgpu::bitset
+#include <stdgpu/iterator.h> // device_begin, device_end
+#include <stdgpu/memory.h>   // createDeviceArray, destroyDeviceArray
+#include <stdgpu/platform.h> // STDGPU_HOST_DEVICE
+
+void
+hip_check_error(hipError_t status, const char* file, int line)
+{
+    if (status != hipSuccess)
+    {
+        std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl;
+    }
+}
+
+#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__)
+
+struct is_odd
+{
+    STDGPU_HOST_DEVICE bool
+    operator()(const int x) const
+    {
+        return x % 2 == 1;
+    }
+};
+
+__global__ void
+set_bits(const int* d_result, const stdgpu::index_t d_result_size, stdgpu::bitset<> bits, stdgpu::atomic<int> counter)
+{
+    stdgpu::index_t i = static_cast<stdgpu::index_t>(blockIdx.x * blockDim.x + threadIdx.x);
+
+    if (i >= d_result_size)
+        return;
+
+    bool was_set = bits.set(d_result[i]);
+
+    if (!was_set)
+    {
+        ++counter;
+    }
+}
+
+int
+main()
+{
+    //
+    // EXAMPLE DESCRIPTION
+    // -------------------
+    // This example shows how every second bit of stdgpu::bitset can be set concurrently in a GPU kernel.
+    //
+
+    const stdgpu::index_t n = 100;
+
+    int* d_input = createDeviceArray<int>(n);
+    int* d_result = createDeviceArray<int>(n / 2);
+    stdgpu::bitset<> bits = stdgpu::bitset<>::createDeviceObject(n);
+    stdgpu::atomic<int> counter = stdgpu::atomic<int>::createDeviceObject();
+
+    thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1);
+
+    // d_input : 1, 2, 3, ..., 100
+
+    thrust::copy_if(stdgpu::device_cbegin(d_input),
+                    stdgpu::device_cend(d_input),
+                    stdgpu::device_begin(d_result),
+                    is_odd());
+
+    // d_result : 1, 3, 5, ..., 99
+
+    // bits : 000000..00
+
+    stdgpu::index_t threads = 32;
+    stdgpu::index_t blocks = ((n / 2) + threads - 1) / threads;
+
+    counter.store(0);
+
+    set_bits<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_result, n / 2, bits, counter);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // bits : 010101...01
+
+    std::cout << "First run: The number of set bits is " << bits.count() << " (" << n / 2 << " expected; "
+              << counter.load() << " of those previously unset)" << std::endl;
+
+    counter.store(0);
+
+    set_bits<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_result, n / 2, bits, counter);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // bits : 010101...01
+
+    std::cout << "Second run: The number of set bits is " << bits.count() << " (" << n / 2 << " expected; "
+              << counter.load() << " of those previously unset)" << std::endl;
+
+    destroyDeviceArray<int>(d_input);
+    destroyDeviceArray<int>(d_result);
+    stdgpu::bitset<>::destroyDeviceObject(bits);
+    stdgpu::atomic<int>::destroyDeviceObject(counter);
+}
diff --git a/examples/hip/deque.hip b/examples/hip/deque.hip
new file mode 100644
index 000000000..b92d4e62c
--- /dev/null
+++ b/examples/hip/deque.hip
@@ -0,0 +1,110 @@
+/*
+ *  Copyright 2025 Patrick Stotko
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <iostream>
+#include <thrust/copy.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+
+#include <stdgpu/deque.cuh>  // stdgpu::deque
+#include <stdgpu/iterator.h> // device_begin, device_end
+#include <stdgpu/memory.h>   // createDeviceArray, destroyDeviceArray
+#include <stdgpu/platform.h> // STDGPU_HOST_DEVICE
+
+void
+hip_check_error(hipError_t status, const char* file, int line)
+{
+    if (status != hipSuccess)
+    {
+        std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl;
+    }
+}
+
+#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__)
+
+struct is_odd
+{
+    STDGPU_HOST_DEVICE bool
+    operator()(const int x) const
+    {
+        return x % 2 == 1;
+    }
+};
+
+__global__ void
+insert_neighbors_with_duplicates(const int* d_input, const stdgpu::index_t n, stdgpu::deque<int> deq)
+{
+    stdgpu::index_t i = static_cast<stdgpu::index_t>(blockIdx.x * blockDim.x + threadIdx.x);
+
+    if (i >= n)
+        return;
+
+    int num = d_input[i];
+    int num_neighborhood[3] = { num - 1, num, num + 1 };
+
+    is_odd odd;
+    for (int num_neighbor : num_neighborhood)
+    {
+        if (odd(num_neighbor))
+        {
+            deq.push_back(num_neighbor);
+        }
+        else
+        {
+            deq.push_front(num_neighbor);
+        }
+    }
+}
+
+int
+main()
+{
+    //
+    // EXAMPLE DESCRIPTION
+    // -------------------
+    // This example demonstrates how stdgpu::deque is used to compute a set of duplicated numbers.
+    // Every number is contained 3 times, except for the first and last one.
+    // Furthermore, even numbers are put into the front, whereas odd number are put into the back.
+    //
+
+    const stdgpu::index_t n = 100;
+
+    int* d_input = createDeviceArray<int>(n);
+    stdgpu::deque<int> deq = stdgpu::deque<int>::createDeviceObject(3 * n);
+
+    thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1);
+
+    // d_input : 1, 2, 3, ..., 100
+
+    stdgpu::index_t threads = 32;
+    stdgpu::index_t blocks = (n + threads - 1) / threads;
+    insert_neighbors_with_duplicates<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_input,
+                                                                                                                n,
+                                                                                                                deq);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // deq : 0, 1, 1, 2, 2, 2, 3, 3, 3,  ..., 99, 99, 99, 100, 100, 101
+
+    auto range_deq = deq.device_range();
+    int sum = thrust::reduce(range_deq.begin(), range_deq.end(), 0, thrust::plus<int>());
+
+    const int sum_closed_form = 3 * (n * (n + 1) / 2);
+
+    std::cout << "The set of duplicated numbers contains " << deq.size() << " elements (" << 3 * n
+              << " expected) and the computed sum is " << sum << " (" << sum_closed_form << " expected)" << std::endl;
+
+    destroyDeviceArray<int>(d_input);
+    stdgpu::deque<int>::destroyDeviceObject(deq);
+}
diff --git a/examples/hip/mutex_array.hip b/examples/hip/mutex_array.hip
new file mode 100644
index 000000000..fadf331c3
--- /dev/null
+++ b/examples/hip/mutex_array.hip
@@ -0,0 +1,118 @@
+/*
+ *  Copyright 2025 Patrick Stotko
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <iostream>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+
+#include <stdgpu/atomic.cuh> // stdgpu::atomic
+#include <stdgpu/iterator.h> // device_begin, device_end
+#include <stdgpu/memory.h>   // createDeviceArray, destroyDeviceArray
+#include <stdgpu/mutex.cuh>  // stdgpu::mutex_array
+#include <stdgpu/platform.h> // STDGPU_HOST_DEVICE
+#include <stdgpu/vector.cuh> // stdgpu::vector
+
+void
+hip_check_error(hipError_t status, const char* file, int line)
+{
+    if (status != hipSuccess)
+    {
+        std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl;
+    }
+}
+
+#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__)
+
+struct is_odd
+{
+    STDGPU_HOST_DEVICE bool
+    operator()(const int x) const
+    {
+        return x % 2 == 1;
+    }
+};
+
+__global__ void
+try_partial_sum(const int* d_input, const stdgpu::index_t n, stdgpu::mutex_array<> locks, int* d_result)
+{
+    stdgpu::index_t i = static_cast<stdgpu::index_t>(blockIdx.x * blockDim.x + threadIdx.x);
+
+    if (i >= n)
+        return;
+
+    stdgpu::index_t j = i % locks.size();
+
+    // While loops might hang due to internal driver scheduling, so use a fixed number of trials.
+    // Do not loop over try_lock(). Instead, loop over the whole sequential part to avoid deadlocks.
+    bool finished = false;
+    const stdgpu::index_t number_trials = 5;
+    for (stdgpu::index_t k = 0; k < number_trials; ++k)
+    {
+        // --- SEQUENTIAL PART ---
+        if (!finished && locks[j].try_lock())
+        {
+            // START --- critical section --- START
+
+            d_result[j] += d_input[i];
+
+            //  END  --- critical section ---  END
+            locks[j].unlock();
+            finished = true;
+        }
+        // --- SEQUENTIAL PART ---
+    }
+}
+
+int
+main()
+{
+    //
+    // EXAMPLE DESCRIPTION
+    // -------------------
+    // This example demonstrates how stdgpu::mutex_array can be used to implement spin locks on the GPU.
+    // Since the correct usage still comes with many implications, this example is oversimplified and just shows the
+    // deadlock-free looping.
+    //
+
+    const stdgpu::index_t n = 100;
+    const stdgpu::index_t m = 10;
+
+    int* d_input = createDeviceArray<int>(n);
+    int* d_result = createDeviceArray<int>(m);
+    stdgpu::mutex_array<> locks = stdgpu::mutex_array<>::createDeviceObject(m);
+
+    thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1);
+
+    // d_input : 1, 2, 3, ..., 100
+
+    stdgpu::index_t threads = 32;
+    stdgpu::index_t blocks = (n + threads - 1) / threads;
+    try_partial_sum<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_input,
+                                                                                               n,
+                                                                                               locks,
+                                                                                               d_result);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    int sum = thrust::reduce(stdgpu::device_cbegin(d_result), stdgpu::device_cend(d_result), 0, thrust::plus<int>());
+
+    const int sum_closed_form = n * (n + 1) / 2;
+
+    std::cout << "The sum of all partially computed sums (via mutex locks) is " << sum
+              << " which intentionally might not match the expected value of " << sum_closed_form << std::endl;
+
+    destroyDeviceArray<int>(d_input);
+    destroyDeviceArray<int>(d_result);
+    stdgpu::mutex_array<>::destroyDeviceObject(locks);
+}
diff --git a/examples/hip/unordered_map.hip b/examples/hip/unordered_map.hip
new file mode 100644
index 000000000..943300fb9
--- /dev/null
+++ b/examples/hip/unordered_map.hip
@@ -0,0 +1,128 @@
+/*
+ *  Copyright 2025 Patrick Stotko
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <iostream>
+#include <thrust/copy.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+
+#include <stdgpu/iterator.h>        // device_begin, device_end
+#include <stdgpu/memory.h>          // createDeviceArray, destroyDeviceArray
+#include <stdgpu/platform.h>        // STDGPU_HOST_DEVICE
+#include <stdgpu/unordered_map.cuh> // stdgpu::unordered_map
+
+void
+hip_check_error(hipError_t status, const char* file, int line)
+{
+    if (status != hipSuccess)
+    {
+        std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl;
+    }
+}
+
+#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__)
+
+struct is_odd
+{
+    STDGPU_HOST_DEVICE bool
+    operator()(const int x) const
+    {
+        return x % 2 == 1;
+    }
+};
+
+struct square
+{
+    STDGPU_HOST_DEVICE int
+    operator()(const int x) const
+    {
+        return x * x;
+    }
+};
+
+struct int_pair_plus
+{
+    STDGPU_HOST_DEVICE stdgpu::pair<int, int>
+    operator()(const stdgpu::pair<int, int>& lhs, const stdgpu::pair<int, int>& rhs) const
+    {
+        return { lhs.first + rhs.first, lhs.second + rhs.second };
+    }
+};
+
+__global__ void
+insert_neighbors(const int* d_result, const stdgpu::index_t n, stdgpu::unordered_map<int, int> map)
+{
+    stdgpu::index_t i = static_cast<stdgpu::index_t>(blockIdx.x * blockDim.x + threadIdx.x);
+
+    if (i >= n)
+        return;
+
+    int num = d_result[i];
+    int num_neighborhood[3] = { num - 1, num, num + 1 };
+
+    for (int num_neighbor : num_neighborhood)
+    {
+        map.emplace(num_neighbor, square()(num_neighbor));
+    }
+}
+
+int
+main()
+{
+    //
+    // EXAMPLE DESCRIPTION
+    // -------------------
+    // This example demonstrates how stdgpu::unordered_map is used to compute a duplicate-free set of numbers.
+    //
+
+    const stdgpu::index_t n = 100;
+
+    int* d_input = createDeviceArray<int>(n);
+    int* d_result = createDeviceArray<int>(n / 2);
+    stdgpu::unordered_map<int, int> map = stdgpu::unordered_map<int, int>::createDeviceObject(n);
+
+    thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1);
+
+    // d_input : 1, 2, 3, ..., 100
+
+    thrust::copy_if(stdgpu::device_cbegin(d_input),
+                    stdgpu::device_cend(d_input),
+                    stdgpu::device_begin(d_result),
+                    is_odd());
+
+    // d_result : 1, 3, 5, ..., 99
+
+    stdgpu::index_t threads = 32;
+    stdgpu::index_t blocks = (n / 2 + threads - 1) / threads;
+    insert_neighbors<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_result, n / 2, map);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // map : 0, 1, 2, 3, ..., 100
+
+    auto range_map = map.device_range();
+    stdgpu::pair<int, int> sum =
+            thrust::reduce(range_map.begin(), range_map.end(), stdgpu::pair<int, int>(0, 0), int_pair_plus());
+
+    const stdgpu::pair<int, int> sum_closed_form = { static_cast<int>(n * (n + 1) / 2),
+                                                     static_cast<int>(n * (n + 1) * (2 * n + 1) / 6) };
+
+    std::cout << "The duplicate-free map of numbers contains " << map.size() << " elements (" << n + 1
+              << " expected) and the computed sums are (" << sum.first << ", " << sum.second << ") (("
+              << sum_closed_form.first << ", " << sum_closed_form.second << ") expected)" << std::endl;
+
+    destroyDeviceArray<int>(d_input);
+    destroyDeviceArray<int>(d_result);
+    stdgpu::unordered_map<int, int>::destroyDeviceObject(map);
+}
diff --git a/examples/hip/unordered_set.hip b/examples/hip/unordered_set.hip
new file mode 100644
index 000000000..7ed25789c
--- /dev/null
+++ b/examples/hip/unordered_set.hip
@@ -0,0 +1,107 @@
+/*
+ *  Copyright 2025 Patrick Stotko
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <iostream>
+#include <thrust/copy.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+
+#include <stdgpu/iterator.h>        // device_begin, device_end
+#include <stdgpu/memory.h>          // createDeviceArray, destroyDeviceArray
+#include <stdgpu/platform.h>        // STDGPU_HOST_DEVICE
+#include <stdgpu/unordered_set.cuh> // stdgpu::unordered_set
+
+void
+hip_check_error(hipError_t status, const char* file, int line)
+{
+    if (status != hipSuccess)
+    {
+        std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl;
+    }
+}
+
+#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__)
+
+struct is_odd
+{
+    STDGPU_HOST_DEVICE bool
+    operator()(const int x) const
+    {
+        return x % 2 == 1;
+    }
+};
+
+__global__ void
+insert_neighbors(const int* d_result, const stdgpu::index_t n, stdgpu::unordered_set<int> set)
+{
+    stdgpu::index_t i = static_cast<stdgpu::index_t>(blockIdx.x * blockDim.x + threadIdx.x);
+
+    if (i >= n)
+        return;
+
+    int num = d_result[i];
+    int num_neighborhood[3] = { num - 1, num, num + 1 };
+
+    for (int num_neighbor : num_neighborhood)
+    {
+        set.insert(num_neighbor);
+    }
+}
+
+int
+main()
+{
+    //
+    // EXAMPLE DESCRIPTION
+    // -------------------
+    // This example demonstrates how stdgpu::unordered_set is used to compute a duplicate-free set of numbers.
+    //
+
+    const stdgpu::index_t n = 100;
+
+    int* d_input = createDeviceArray<int>(n);
+    int* d_result = createDeviceArray<int>(n / 2);
+    stdgpu::unordered_set<int> set = stdgpu::unordered_set<int>::createDeviceObject(n);
+
+    thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1);
+
+    // d_input : 1, 2, 3, ..., 100
+
+    thrust::copy_if(stdgpu::device_cbegin(d_input),
+                    stdgpu::device_cend(d_input),
+                    stdgpu::device_begin(d_result),
+                    is_odd());
+
+    // d_result : 1, 3, 5, ..., 99
+
+    stdgpu::index_t threads = 32;
+    stdgpu::index_t blocks = (n / 2 + threads - 1) / threads;
+    insert_neighbors<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_result, n / 2, set);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // set : 0, 1, 2, 3, ..., 100
+
+    auto range_set = set.device_range();
+    int sum = thrust::reduce(range_set.begin(), range_set.end(), 0, thrust::plus<int>());
+
+    const int sum_closed_form = n * (n + 1) / 2;
+
+    std::cout << "The duplicate-free set of numbers contains " << set.size() << " elements (" << n + 1
+              << " expected) and the computed sum is " << sum << " (" << sum_closed_form << " expected)" << std::endl;
+
+    destroyDeviceArray<int>(d_input);
+    destroyDeviceArray<int>(d_result);
+    stdgpu::unordered_set<int>::destroyDeviceObject(set);
+}
diff --git a/examples/hip/vector.hip b/examples/hip/vector.hip
new file mode 100644
index 000000000..42d1272db
--- /dev/null
+++ b/examples/hip/vector.hip
@@ -0,0 +1,92 @@
+/*
+ *  Copyright 2025 Patrick Stotko
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <iostream>
+#include <thrust/copy.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+
+#include <stdgpu/iterator.h> // device_begin, device_end
+#include <stdgpu/memory.h>   // createDeviceArray, destroyDeviceArray
+#include <stdgpu/platform.h> // STDGPU_HOST_DEVICE
+#include <stdgpu/vector.cuh> // stdgpu::vector
+
+void
+hip_check_error(hipError_t status, const char* file, int line)
+{
+    if (status != hipSuccess)
+    {
+        std::cerr << "HIP Error: " << hipGetErrorString(status) << " at " << file << ":" << line << std::endl;
+    }
+}
+
+#define HIP_CHECK(...) hip_check_error((__VA_ARGS__), __FILE__, __LINE__)
+
+__global__ void
+insert_neighbors_with_duplicates(const int* d_input, const stdgpu::index_t n, stdgpu::vector<int> vec)
+{
+    stdgpu::index_t i = static_cast<stdgpu::index_t>(blockIdx.x * blockDim.x + threadIdx.x);
+
+    if (i >= n)
+        return;
+
+    int num = d_input[i];
+    int num_neighborhood[3] = { num - 1, num, num + 1 };
+
+    for (int num_neighbor : num_neighborhood)
+    {
+        vec.push_back(num_neighbor);
+    }
+}
+
+int
+main()
+{
+    //
+    // EXAMPLE DESCRIPTION
+    // -------------------
+    // This example demonstrates how stdgpu::vector is used to compute a set of duplicated numbers.
+    // Every number is contained 3 times, except for the first and last one which is contained only 2 times.
+    //
+
+    const stdgpu::index_t n = 100;
+
+    int* d_input = createDeviceArray<int>(n);
+    stdgpu::vector<int> vec = stdgpu::vector<int>::createDeviceObject(3 * n);
+
+    thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1);
+
+    // d_input : 1, 2, 3, ..., 100
+
+    stdgpu::index_t threads = 32;
+    stdgpu::index_t blocks = (n + threads - 1) / threads;
+    insert_neighbors_with_duplicates<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_input,
+                                                                                                                n,
+                                                                                                                vec);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // vec : 0, 1, 1, 2, 2, 2, 3, 3, 3,  ..., 99, 99, 99, 100, 100, 101
+
+    auto range_vec = vec.device_range();
+    int sum = thrust::reduce(range_vec.begin(), range_vec.end(), 0, thrust::plus<int>());
+
+    const int sum_closed_form = 3 * (n * (n + 1) / 2);
+
+    std::cout << "The set of duplicated numbers contains " << vec.size() << " elements (" << 3 * n
+              << " expected) and the computed sum is " << sum << " (" << sum_closed_form << " expected)" << std::endl;
+
+    destroyDeviceArray<int>(d_input);
+    stdgpu::vector<int>::destroyDeviceObject(vec);
+}