diff --git a/CMakeLists.txt b/CMakeLists.txt index 7104450..a39415a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,6 +90,7 @@ endif() if (ENABLE_CUSPARSE) set(SPBLAS_GPU_BACKEND ON) find_package(CUDAToolkit REQUIRED) + enable_language(CUDA) target_link_libraries(spblas INTERFACE CUDA::cudart CUDA::cusparse CUDA::cublas) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_CUSPARSE") endif() diff --git a/include/spblas/vendor/cusparse/cusparse.hpp b/include/spblas/vendor/cusparse/cusparse.hpp index 7caa698..014b2ba 100644 --- a/include/spblas/vendor/cusparse/cusparse.hpp +++ b/include/spblas/vendor/cusparse/cusparse.hpp @@ -1,3 +1,4 @@ #pragma once #include "multiply.hpp" +#include "multiply_spgemm.hpp" diff --git a/include/spblas/vendor/cusparse/exception.hpp b/include/spblas/vendor/cusparse/exception.hpp index d90ac30..4be80eb 100644 --- a/include/spblas/vendor/cusparse/exception.hpp +++ b/include/spblas/vendor/cusparse/exception.hpp @@ -10,7 +10,7 @@ namespace spblas { namespace __cusparse { // Throw an exception if the cudaError_t is not cudaSuccess. -void throw_if_error(cudaError_t error_code, std::string prefix = "") { +inline void throw_if_error(cudaError_t error_code, std::string prefix = "") { if (error_code == cudaSuccess) { return; } @@ -21,7 +21,7 @@ void throw_if_error(cudaError_t error_code, std::string prefix = "") { } // Throw an exception if the cusparseStatus_t is not CUSPARSE_STATUS_SUCCESS. -void throw_if_error(cusparseStatus_t error_code) { +inline void throw_if_error(cusparseStatus_t error_code) { if (error_code == CUSPARSE_STATUS_SUCCESS) { return; } else if (error_code == CUSPARSE_STATUS_NOT_INITIALIZED) { diff --git a/include/spblas/vendor/cusparse/multiply_spgemm.hpp b/include/spblas/vendor/cusparse/multiply_spgemm.hpp new file mode 100644 index 0000000..d8b9f71 --- /dev/null +++ b/include/spblas/vendor/cusparse/multiply_spgemm.hpp @@ -0,0 +1,379 @@ +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +#include "cuda_allocator.hpp" +#include "detail/cusparse_tensors.hpp" +#include "exception.hpp" +#include "types.hpp" + +namespace spblas { + +class spgemm_state_t { +public: + spgemm_state_t() : spgemm_state_t(cusparse::cuda_allocator{}) {} + + spgemm_state_t(cusparse::cuda_allocator alloc) + : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0), buffer_size_3_(0), + buffer_size_4_(0), buffer_size_5_(0), workspace_1_(nullptr), + workspace_2_(nullptr), workspace_3_(nullptr), workspace_4_(nullptr), + workspace_5_(nullptr), result_nnz_(0), result_shape_(0, 0) { + cusparseHandle_t handle; + __cusparse::throw_if_error(cusparseCreate(&handle)); + if (auto stream = alloc.stream()) { + cusparseSetStream(handle, stream); + } + handle_ = handle_manager(handle, [](cusparseHandle_t handle) { + __cusparse::throw_if_error(cusparseDestroy(handle)); + }); + __cusparse::throw_if_error(cusparseSpGEMM_createDescr(&descr_)); + } + + spgemm_state_t(cusparse::cuda_allocator alloc, cusparseHandle_t handle) + : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0), + workspace_1_(nullptr), workspace_2_(nullptr), result_nnz_(0), + result_shape_(0, 0) { + handle_ = handle_manager(handle, [](cusparseHandle_t handle) { + // it is provided by user, we do not delete it at all. 
+ }); + __cusparse::throw_if_error(cusparseSpGEMM_createDescr(&descr_)); + } + + ~spgemm_state_t() { + alloc_.deallocate(workspace_1_, buffer_size_1_); + alloc_.deallocate(workspace_2_, buffer_size_2_); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_)); + __cusparse::throw_if_error(cusparseSpGEMM_destroyDescr(descr_)); + } + + auto result_shape() { + return this->result_shape_; + } + + auto result_nnz() { + return this->result_nnz_; + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_compute(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + size_t buffer_size = 0; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 1; + auto handle = this->handle_.get(); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_)); + mat_a_ = __cusparse::create_cusparse_handle(a_base); + mat_b_ = __cusparse::create_cusparse_handle(b_base); + mat_c_ = __cusparse::create_cusparse_handle(c); + + // ask bufferSize1 bytes for external memory + size_t buffer_size_1 = 0; + __cusparse::throw_if_error(cusparseSpGEMM_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_1, NULL)); + if (buffer_size_1 > this->buffer_size_1_) { + this->alloc_.deallocate(this->workspace_1_, buffer_size_1_); + this->buffer_size_1_ = buffer_size_1; + this->workspace_1_ = this->alloc_.allocate(buffer_size_1); + } + // inspect the matrices A and B to understand the memory requirement for + // the next step + __cusparse::throw_if_error(cusparseSpGEMM_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_1, this->workspace_1_)); + + // ask buffer_size_2 bytes for external memory + size_t buffer_size_2 = 0; + __cusparse::throw_if_error(cusparseSpGEMM_compute( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_2, NULL)); + if (buffer_size_2 > this->buffer_size_2_) { + this->alloc_.deallocate(this->workspace_2_, buffer_size_2_); + this->buffer_size_2_ = buffer_size_2; + this->workspace_2_ = this->alloc_.allocate(buffer_size_2); + } + + // compute the intermediate product of A * B + cusparseSpGEMM_compute( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_2, this->workspace_2_); + // get matrix C non-zero entries c_nnz + int64_t c_num_rows, c_num_cols, c_nnz; + cusparseSpMatGetSize(mat_c_, &c_num_rows, &c_num_cols, &c_nnz); + this->result_nnz_ = c_nnz; + this->result_shape_ = index(c_num_rows, 
c_num_cols); + } + + // C = AB + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_fill(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 1; + auto handle = this->handle_.get(); + __cusparse::throw_if_error(cusparseCsrSetPointers( + mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + + __cusparse::throw_if_error(cusparseSpGEMM_copy( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_)); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_symbolic_compute(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + size_t buffer_size = 0; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 1; + auto handle = this->handle_.get(); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_)); + mat_a_ = __cusparse::create_cusparse_handle(a_base); + mat_b_ = __cusparse::create_cusparse_handle(b_base); + mat_c_ = __cusparse::create_cusparse_handle(c); + + // ask bufferSize1 bytes for external memory + size_t buffer_size_1 = 0; + __cusparse::throw_if_error(cusparseSpGEMMreuse_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_, + CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_1, NULL)); + if (buffer_size_1 > this->buffer_size_1_) { + this->alloc_.deallocate(this->workspace_1_, buffer_size_1_); + this->buffer_size_1_ = buffer_size_1; + this->workspace_1_ = this->alloc_.allocate(buffer_size_1); + } + // inspect the matrices A and B to understand the memory requirement for + // the next step + __cusparse::throw_if_error(cusparseSpGEMMreuse_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_, + CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_1, + this->workspace_1_)); + + // ask buffer_size_2/3/4 bytes for external memory + size_t buffer_size_2 = 0; + size_t buffer_size_3 = 0; + size_t buffer_size_4 = 0; + cusparseSpGEMMreuse_nnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_2, NULL, &buffer_size_3, NULL, + &buffer_size_4, NULL); + if (buffer_size_2 > this->buffer_size_2_) { + this->alloc_.deallocate(this->workspace_2_, buffer_size_2_); + this->buffer_size_2_ = buffer_size_2; + this->workspace_2_ = this->alloc_.allocate(buffer_size_2); + } + if (buffer_size_3 > this->buffer_size_3_) { + this->alloc_.deallocate(this->workspace_3_, buffer_size_3_); + 
this->buffer_size_3_ = buffer_size_3; + this->workspace_3_ = this->alloc_.allocate(buffer_size_3); + } + if (buffer_size_4 > this->buffer_size_4_) { + this->alloc_.deallocate(this->workspace_4_, buffer_size_4_); + this->buffer_size_4_ = buffer_size_4; + this->workspace_4_ = this->alloc_.allocate(buffer_size_4); + } + + // compute nnz + cusparseSpGEMMreuse_nnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_2, this->workspace_2_, &buffer_size_3, + this->workspace_3_, &buffer_size_4, + this->workspace_4_); + // get matrix C non-zero entries c_nnz + int64_t c_num_rows, c_num_cols, c_nnz; + cusparseSpMatGetSize(mat_c_, &c_num_rows, &c_num_cols, &c_nnz); + this->result_nnz_ = c_nnz; + this->result_shape_ = index(c_num_rows, c_num_cols); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_symbolic_fill(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 0; + + __cusparse::throw_if_error(cusparseCsrSetPointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + + auto handle = this->handle_.get(); + size_t buffer_size_5 = 0; + cusparseSpGEMMreuse_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_5, NULL); + if (buffer_size_5 > this->buffer_size_5_) { + this->alloc_.deallocate(this->workspace_5_, buffer_size_5_); + this->buffer_size_5_ = buffer_size_5; + this->workspace_5_ = this->alloc_.allocate(buffer_size_5); + } + cusparseSpGEMMreuse_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_5, this->workspace_5_); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_numeric(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + tensor_scalar_t alpha = alpha_optional.value_or(1); + value_type alpha_val = alpha; + value_type beta = 0; + + auto handle = this->handle_.get(); + + // Update the pointer from the matrix but they must contains the same + // sparsity as the previous call. 
+ __cusparse::throw_if_error( + cusparseCsrSetPointers(this->mat_a_, a_base.rowptr().data(), + a_base.colind().data(), a_base.values().data())); + __cusparse::throw_if_error( + cusparseCsrSetPointers(this->mat_b_, b_base.rowptr().data(), + b_base.colind().data(), b_base.values().data())); + __cusparse::throw_if_error(cusparseCsrSetPointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + cusparseSpGEMMreuse_compute(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, + CUSPARSE_SPGEMM_DEFAULT, this->descr_); + } + +private: + using handle_manager = + std::unique_ptr::element_type, + std::function>; + handle_manager handle_; + cusparse::cuda_allocator alloc_; + size_t buffer_size_1_; + size_t buffer_size_2_; + size_t buffer_size_3_; + size_t buffer_size_4_; + size_t buffer_size_5_; + char* workspace_1_; + char* workspace_2_; + char* workspace_3_; + char* workspace_4_; + char* workspace_5_; + index result_shape_; + index_t result_nnz_; + cusparseSpMatDescr_t mat_a_ = nullptr; + cusparseSpMatDescr_t mat_b_ = nullptr; + cusparseSpMatDescr_t mat_c_ = nullptr; + cusparseSpGEMMDescr_t descr_; +}; + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_inspect(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + spgemm_handle.multiply_compute(a, b, c); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + spgemm_handle.multiply_fill(a, b, c); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_symbolic_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c) { + spgemm_handle.multiply_symbolic_compute(a, b, c); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_symbolic_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c) { + spgemm_handle.multiply_symbolic_fill(a, b, c); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_numeric(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + spgemm_handle.multiply_numeric(a, b, c); +} + +} // namespace spblas diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index cd96de1..884f549 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -20,7 +20,7 @@ if (SPBLAS_GPU_BACKEND) set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp) set_source_files_properties(${GPUTEST_SOURCES} PROPERTIES LANGUAGE HIP) else () - set(GPUTEST_SOURCES device/spmv_test.cpp) + set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp) endif () list(APPEND TEST_SOURCES ${GPUTEST_SOURCES}) endif() diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp index b798d6e..3a6e339 100644 --- a/test/gtest/device/spgemm_reuse_test.cpp +++ b/test/gtest/device/spgemm_reuse_test.cpp @@ -34,7 +34,8 @@ TEST(thrust_CsrView, SpGEMMReuse) { spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - 
thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -42,8 +43,10 @@ TEST(thrust_CsrView, SpGEMMReuse) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, d_a, d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -68,9 +71,6 @@ TEST(thrust_CsrView, SpGEMMReuse) { thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); } spblas::multiply_numeric(state, d_a, d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -138,8 +138,8 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -147,8 +147,10 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, scaled(alpha, d_a), d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -173,9 +175,6 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) { thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); } spblas::multiply_numeric(state, scaled(alpha, d_a), d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -243,8 +242,8 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -252,8 +251,10 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, d_a, scaled(alpha, d_b), d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span 
d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -278,9 +279,6 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) { thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); } spblas::multiply_numeric(state, d_a, scaled(alpha, d_b), d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -348,7 +346,8 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -356,8 +355,10 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, d_a, d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -365,6 +366,9 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { nnz); spblas::multiply_symbolic_fill(state, d_a, d_b, d_c); + // move the sparsity back to host for later copy + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); std::mt19937 g(0); for (int i = 0; i < 3; i++) { // regenerate value of a and b; @@ -376,16 +380,17 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { v = val_dist(g); } // create different pointers than the symbolic phase, but they still - // hold the same sparsity + // hold the same sparsity. + // note. 
cuda without nvcc can only copy from host to device thrust::device_vector d_a_values_new(a_values); - thrust::device_vector d_a_colind_new(d_a_colind); - thrust::device_vector d_a_rowptr_new(d_a_rowptr); + thrust::device_vector d_a_colind_new(a_colind); + thrust::device_vector d_a_rowptr_new(a_rowptr); thrust::device_vector d_b_values_new(b_values); - thrust::device_vector d_b_colind_new(d_b_colind); - thrust::device_vector d_b_rowptr_new(d_b_rowptr); - thrust::device_vector d_c_values_new(d_c_values); - thrust::device_vector d_c_colind_new(d_c_colind); - thrust::device_vector d_c_rowptr_new(d_c_rowptr); + thrust::device_vector d_b_colind_new(b_colind); + thrust::device_vector d_b_rowptr_new(b_rowptr); + thrust::device_vector d_c_values_new(c_values); + thrust::device_vector d_c_colind_new(c_colind); + thrust::device_vector d_c_rowptr_new(c_rowptr); spblas::csr_view d_a( d_a_values_new.data().get(), d_a_rowptr_new.data().get(), d_a_colind_new.data().get(), a_shape, a_nnz); @@ -398,9 +403,6 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { // call numeric on new data spblas::multiply_numeric(state, d_a, d_b, d_c); // move c back to host memory - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values_new.begin(), d_c_values_new.end(), c_values.begin()); thrust::copy(d_c_rowptr_new.begin(), d_c_rowptr_new.end(), diff --git a/test/gtest/device/spgemm_test.cpp b/test/gtest/device/spgemm_test.cpp index 98e1aed..03d5d1f 100644 --- a/test/gtest/device/spgemm_test.cpp +++ b/test/gtest/device/spgemm_test.cpp @@ -33,8 +33,10 @@ TEST(thrust_CsrView, SpGEMM) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + // Can not create the vector with the size because it will use for_each to + // initilize the value in cuda. 
+ thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -42,8 +44,10 @@ TEST(thrust_CsrView, SpGEMM) { spblas::spgemm_state_t state; spblas::multiply_compute(state, d_a, d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -52,9 +56,6 @@ TEST(thrust_CsrView, SpGEMM) { spblas::multiply_fill(state, d_a, d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -121,8 +122,8 @@ TEST(thrust_CsrView, SpGEMM_AScaled) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -130,8 +131,10 @@ TEST(thrust_CsrView, SpGEMM_AScaled) { spblas::spgemm_state_t state; spblas::multiply_compute(state, spblas::scaled(alpha, d_a), d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -140,9 +143,6 @@ TEST(thrust_CsrView, SpGEMM_AScaled) { spblas::multiply_fill(state, spblas::scaled(alpha, d_a), d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -210,7 +210,8 @@ TEST(thrust_CsrView, SpGEMM_BScaled) { spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -218,8 +219,10 @@ TEST(thrust_CsrView, SpGEMM_BScaled) { spblas::spgemm_state_t state; spblas::multiply_compute(state, d_a, spblas::scaled(alpha, d_b), d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -228,9 +231,6 @@ TEST(thrust_CsrView, SpGEMM_BScaled) { spblas::multiply_fill(state, d_a, spblas::scaled(alpha, d_b), d_c); - std::vector c_values(nnz); - 
std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
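
For orientation, here is a minimal usage sketch of the spgemm_state_t reuse path introduced above, distilled from test/gtest/device/spgemm_reuse_test.cpp. It is not part of the patch: the csr_view template arguments (float values, int indices), the surrounding host CSR data (a_values, a_rowptr, a_colind, shapes, nnz counts, m, n), and the way C's buffers are rebound after the symbolic phase are assumptions carried over from the existing device tests, not a definitive API reference. As the tests note, device buffers are built from host std::vectors because sizing a thrust::device_vector directly would require device code (nvcc) to initialize its elements.

// Hedged sketch, not part of the diff. Assumes float/int CSR data for A and B
// already on the host (a_values, a_rowptr, a_colind, a_shape, a_nnz, and the
// b_* equivalents), plus output dimensions m x n, as in the device tests.
thrust::device_vector<float> d_a_values(a_values);
thrust::device_vector<int> d_a_rowptr(a_rowptr);
thrust::device_vector<int> d_a_colind(a_colind);
thrust::device_vector<float> d_b_values(b_values);
thrust::device_vector<int> d_b_rowptr(b_rowptr);
thrust::device_vector<int> d_b_colind(b_colind);

spblas::csr_view<float, int> d_a(d_a_values.data().get(), d_a_rowptr.data().get(),
                                 d_a_colind.data().get(), a_shape, a_nnz);
spblas::csr_view<float, int> d_b(d_b_values.data().get(), d_b_rowptr.data().get(),
                                 d_b_colind.data().get(), b_shape, b_nnz);

// C starts with only a row-pointer buffer; values/colind are sized once the
// symbolic phase has reported the output nnz.
std::vector<int> c_rowptr(m + 1);
thrust::device_vector<int> d_c_rowptr(c_rowptr);
spblas::csr_view<float, int> d_c(nullptr, d_c_rowptr.data().get(), nullptr,
                                 {m, n}, 0);

spblas::spgemm_state_t state;
spblas::multiply_symbolic_compute(state, d_a, d_b, d_c);  // symbolic C = A * B
auto nnz = state.result_nnz();

std::vector<float> c_values(nnz);
std::vector<int> c_colind(nnz);
thrust::device_vector<float> d_c_values(c_values);
thrust::device_vector<int> d_c_colind(c_colind);

// Rebind C to the now-allocated buffers (one possible way; the state only
// needs C's pointers) and fill its sparsity pattern.
spblas::csr_view<float, int> d_c_full(d_c_values.data().get(),
                                      d_c_rowptr.data().get(),
                                      d_c_colind.data().get(), {m, n}, nnz);
spblas::multiply_symbolic_fill(state, d_a, d_b, d_c_full);

// The values of A and B may change between calls as long as their sparsity
// stays the same; only the numeric phase is repeated.
spblas::multiply_numeric(state, d_a, d_b, d_c_full);

// The one-shot path is analogous: multiply_compute(state, a, b, c) followed
// by multiply_fill(state, a, b, c); scaled operands such as
// spblas::scaled(alpha, d_a) are accepted in either path.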