diff --git a/CMakeLists.txt b/CMakeLists.txt index 7104450..a39415a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,6 +90,7 @@ endif() if (ENABLE_CUSPARSE) set(SPBLAS_GPU_BACKEND ON) find_package(CUDAToolkit REQUIRED) + enable_language(CUDA) target_link_libraries(spblas INTERFACE CUDA::cudart CUDA::cusparse CUDA::cublas) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_CUSPARSE") endif() diff --git a/include/spblas/vendor/cusparse/cusparse.hpp b/include/spblas/vendor/cusparse/cusparse.hpp index 7caa698..014b2ba 100644 --- a/include/spblas/vendor/cusparse/cusparse.hpp +++ b/include/spblas/vendor/cusparse/cusparse.hpp @@ -1,3 +1,4 @@ #pragma once #include "multiply.hpp" +#include "multiply_spgemm.hpp" diff --git a/include/spblas/vendor/cusparse/exception.hpp b/include/spblas/vendor/cusparse/exception.hpp index d90ac30..4be80eb 100644 --- a/include/spblas/vendor/cusparse/exception.hpp +++ b/include/spblas/vendor/cusparse/exception.hpp @@ -10,7 +10,7 @@ namespace spblas { namespace __cusparse { // Throw an exception if the cudaError_t is not cudaSuccess. -void throw_if_error(cudaError_t error_code, std::string prefix = "") { +inline void throw_if_error(cudaError_t error_code, std::string prefix = "") { if (error_code == cudaSuccess) { return; } @@ -21,7 +21,7 @@ void throw_if_error(cudaError_t error_code, std::string prefix = "") { } // Throw an exception if the cusparseStatus_t is not CUSPARSE_STATUS_SUCCESS. -void throw_if_error(cusparseStatus_t error_code) { +inline void throw_if_error(cusparseStatus_t error_code) { if (error_code == CUSPARSE_STATUS_SUCCESS) { return; } else if (error_code == CUSPARSE_STATUS_NOT_INITIALIZED) { diff --git a/include/spblas/vendor/cusparse/multiply_spgemm.hpp b/include/spblas/vendor/cusparse/multiply_spgemm.hpp new file mode 100644 index 0000000..d8b9f71 --- /dev/null +++ b/include/spblas/vendor/cusparse/multiply_spgemm.hpp @@ -0,0 +1,379 @@ +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +#include "cuda_allocator.hpp" +#include "detail/cusparse_tensors.hpp" +#include "exception.hpp" +#include "types.hpp" + +namespace spblas { + +class spgemm_state_t { +public: + spgemm_state_t() : spgemm_state_t(cusparse::cuda_allocator{}) {} + + spgemm_state_t(cusparse::cuda_allocator alloc) + : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0), buffer_size_3_(0), + buffer_size_4_(0), buffer_size_5_(0), workspace_1_(nullptr), + workspace_2_(nullptr), workspace_3_(nullptr), workspace_4_(nullptr), + workspace_5_(nullptr), result_nnz_(0), result_shape_(0, 0) { + cusparseHandle_t handle; + __cusparse::throw_if_error(cusparseCreate(&handle)); + if (auto stream = alloc.stream()) { + cusparseSetStream(handle, stream); + } + handle_ = handle_manager(handle, [](cusparseHandle_t handle) { + __cusparse::throw_if_error(cusparseDestroy(handle)); + }); + __cusparse::throw_if_error(cusparseSpGEMM_createDescr(&descr_)); + } + + spgemm_state_t(cusparse::cuda_allocator alloc, cusparseHandle_t handle) + : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0), + workspace_1_(nullptr), workspace_2_(nullptr), result_nnz_(0), + result_shape_(0, 0) { + handle_ = handle_manager(handle, [](cusparseHandle_t handle) { + // it is provided by user, we do not delete it at all. 
+ }); + __cusparse::throw_if_error(cusparseSpGEMM_createDescr(&descr_)); + } + + ~spgemm_state_t() { + alloc_.deallocate(workspace_1_, buffer_size_1_); + alloc_.deallocate(workspace_2_, buffer_size_2_); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_)); + __cusparse::throw_if_error(cusparseSpGEMM_destroyDescr(descr_)); + } + + auto result_shape() { + return this->result_shape_; + } + + auto result_nnz() { + return this->result_nnz_; + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_compute(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + size_t buffer_size = 0; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 1; + auto handle = this->handle_.get(); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_)); + mat_a_ = __cusparse::create_cusparse_handle(a_base); + mat_b_ = __cusparse::create_cusparse_handle(b_base); + mat_c_ = __cusparse::create_cusparse_handle(c); + + // ask bufferSize1 bytes for external memory + size_t buffer_size_1 = 0; + __cusparse::throw_if_error(cusparseSpGEMM_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_1, NULL)); + if (buffer_size_1 > this->buffer_size_1_) { + this->alloc_.deallocate(this->workspace_1_, buffer_size_1_); + this->buffer_size_1_ = buffer_size_1; + this->workspace_1_ = this->alloc_.allocate(buffer_size_1); + } + // inspect the matrices A and B to understand the memory requirement for + // the next step + __cusparse::throw_if_error(cusparseSpGEMM_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_1, this->workspace_1_)); + + // ask buffer_size_2 bytes for external memory + size_t buffer_size_2 = 0; + __cusparse::throw_if_error(cusparseSpGEMM_compute( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_2, NULL)); + if (buffer_size_2 > this->buffer_size_2_) { + this->alloc_.deallocate(this->workspace_2_, buffer_size_2_); + this->buffer_size_2_ = buffer_size_2; + this->workspace_2_ = this->alloc_.allocate(buffer_size_2); + } + + // compute the intermediate product of A * B + cusparseSpGEMM_compute( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_, &buffer_size_2, this->workspace_2_); + // get matrix C non-zero entries c_nnz + int64_t c_num_rows, c_num_cols, c_nnz; + cusparseSpMatGetSize(mat_c_, &c_num_rows, &c_num_cols, &c_nnz); + this->result_nnz_ = c_nnz; + this->result_shape_ = index(c_num_rows, 
c_num_cols); + } + + // C = AB + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_fill(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 1; + auto handle = this->handle_.get(); + __cusparse::throw_if_error(cusparseCsrSetPointers( + mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + + __cusparse::throw_if_error(cusparseSpGEMM_copy( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, CUSPARSE_SPGEMM_DEFAULT, + this->descr_)); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_symbolic_compute(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + size_t buffer_size = 0; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 1; + auto handle = this->handle_.get(); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_)); + __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_)); + mat_a_ = __cusparse::create_cusparse_handle(a_base); + mat_b_ = __cusparse::create_cusparse_handle(b_base); + mat_c_ = __cusparse::create_cusparse_handle(c); + + // ask bufferSize1 bytes for external memory + size_t buffer_size_1 = 0; + __cusparse::throw_if_error(cusparseSpGEMMreuse_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_, + CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_1, NULL)); + if (buffer_size_1 > this->buffer_size_1_) { + this->alloc_.deallocate(this->workspace_1_, buffer_size_1_); + this->buffer_size_1_ = buffer_size_1; + this->workspace_1_ = this->alloc_.allocate(buffer_size_1); + } + // inspect the matrices A and B to understand the memory requirement for + // the next step + __cusparse::throw_if_error(cusparseSpGEMMreuse_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_, + CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_1, + this->workspace_1_)); + + // ask buffer_size_2/3/4 bytes for external memory + size_t buffer_size_2 = 0; + size_t buffer_size_3 = 0; + size_t buffer_size_4 = 0; + cusparseSpGEMMreuse_nnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_2, NULL, &buffer_size_3, NULL, + &buffer_size_4, NULL); + if (buffer_size_2 > this->buffer_size_2_) { + this->alloc_.deallocate(this->workspace_2_, buffer_size_2_); + this->buffer_size_2_ = buffer_size_2; + this->workspace_2_ = this->alloc_.allocate(buffer_size_2); + } + if (buffer_size_3 > this->buffer_size_3_) { + this->alloc_.deallocate(this->workspace_3_, buffer_size_3_); + 
this->buffer_size_3_ = buffer_size_3; + this->workspace_3_ = this->alloc_.allocate(buffer_size_3); + } + if (buffer_size_4 > this->buffer_size_4_) { + this->alloc_.deallocate(this->workspace_4_, buffer_size_4_); + this->buffer_size_4_ = buffer_size_4; + this->workspace_4_ = this->alloc_.allocate(buffer_size_4); + } + + // compute nnz + cusparseSpGEMMreuse_nnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_2, this->workspace_2_, &buffer_size_3, + this->workspace_3_, &buffer_size_4, + this->workspace_4_); + // get matrix C non-zero entries c_nnz + int64_t c_num_rows, c_num_cols, c_nnz; + cusparseSpMatGetSize(mat_c_, &c_num_rows, &c_num_cols, &c_nnz); + this->result_nnz_ = c_nnz; + this->result_shape_ = index(c_num_rows, c_num_cols); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_symbolic_fill(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + value_type beta = 0; + + __cusparse::throw_if_error(cusparseCsrSetPointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + + auto handle = this->handle_.get(); + size_t buffer_size_5 = 0; + cusparseSpGEMMreuse_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_5, NULL); + if (buffer_size_5 > this->buffer_size_5_) { + this->alloc_.deallocate(this->workspace_5_, buffer_size_5_); + this->buffer_size_5_ = buffer_size_5; + this->workspace_5_ = this->alloc_.allocate(buffer_size_5); + } + cusparseSpGEMMreuse_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, + mat_c_, CUSPARSE_SPGEMM_DEFAULT, this->descr_, + &buffer_size_5, this->workspace_5_); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v + void multiply_numeric(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + tensor_scalar_t alpha = alpha_optional.value_or(1); + value_type alpha_val = alpha; + value_type beta = 0; + + auto handle = this->handle_.get(); + + // Update the pointer from the matrix but they must contains the same + // sparsity as the previous call. 
+ __cusparse::throw_if_error( + cusparseCsrSetPointers(this->mat_a_, a_base.rowptr().data(), + a_base.colind().data(), a_base.values().data())); + __cusparse::throw_if_error( + cusparseCsrSetPointers(this->mat_b_, b_base.rowptr().data(), + b_base.colind().data(), b_base.values().data())); + __cusparse::throw_if_error(cusparseCsrSetPointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + cusparseSpGEMMreuse_compute(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + mat_a_, mat_b_, &beta, mat_c_, + detail::cuda_data_type_v, + CUSPARSE_SPGEMM_DEFAULT, this->descr_); + } + +private: + using handle_manager = + std::unique_ptr::element_type, + std::function>; + handle_manager handle_; + cusparse::cuda_allocator alloc_; + size_t buffer_size_1_; + size_t buffer_size_2_; + size_t buffer_size_3_; + size_t buffer_size_4_; + size_t buffer_size_5_; + char* workspace_1_; + char* workspace_2_; + char* workspace_3_; + char* workspace_4_; + char* workspace_5_; + index result_shape_; + index_t result_nnz_; + cusparseSpMatDescr_t mat_a_ = nullptr; + cusparseSpMatDescr_t mat_b_ = nullptr; + cusparseSpMatDescr_t mat_c_ = nullptr; + cusparseSpGEMMDescr_t descr_; +}; + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_inspect(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + spgemm_handle.multiply_compute(a, b, c); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + spgemm_handle.multiply_fill(a, b, c); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_symbolic_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c) { + spgemm_handle.multiply_symbolic_compute(a, b, c); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_symbolic_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c) { + spgemm_handle.multiply_symbolic_fill(a, b, c); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_numeric(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + spgemm_handle.multiply_numeric(a, b, c); +} + +} // namespace spblas diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index cd96de1..884f549 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -20,7 +20,7 @@ if (SPBLAS_GPU_BACKEND) set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp) set_source_files_properties(${GPUTEST_SOURCES} PROPERTIES LANGUAGE HIP) else () - set(GPUTEST_SOURCES device/spmv_test.cpp) + set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp) endif () list(APPEND TEST_SOURCES ${GPUTEST_SOURCES}) endif() diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp index b798d6e..3a6e339 100644 --- a/test/gtest/device/spgemm_reuse_test.cpp +++ b/test/gtest/device/spgemm_reuse_test.cpp @@ -34,7 +34,8 @@ TEST(thrust_CsrView, SpGEMMReuse) { spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - 
thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -42,8 +43,10 @@ TEST(thrust_CsrView, SpGEMMReuse) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, d_a, d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -68,9 +71,6 @@ TEST(thrust_CsrView, SpGEMMReuse) { thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); } spblas::multiply_numeric(state, d_a, d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -138,8 +138,8 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -147,8 +147,10 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, scaled(alpha, d_a), d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -173,9 +175,6 @@ TEST(thrust_CsrView, SpGEMMReuse_AScaled) { thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); } spblas::multiply_numeric(state, scaled(alpha, d_a), d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -243,8 +242,8 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -252,8 +251,10 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, d_a, scaled(alpha, d_b), d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span 
d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -278,9 +279,6 @@ TEST(thrust_CsrView, SpGEMMReuse_BScaled) { thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); } spblas::multiply_numeric(state, d_a, scaled(alpha, d_b), d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -348,7 +346,8 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -356,8 +355,10 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { spblas::spgemm_state_t state; spblas::multiply_symbolic_compute(state, d_a, d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -365,6 +366,9 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { nnz); spblas::multiply_symbolic_fill(state, d_a, d_b, d_c); + // move the sparsity back to host for later copy + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); std::mt19937 g(0); for (int i = 0; i < 3; i++) { // regenerate value of a and b; @@ -376,16 +380,17 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { v = val_dist(g); } // create different pointers than the symbolic phase, but they still - // hold the same sparsity + // hold the same sparsity. + // note. 
cuda without nvcc can only copy from host to device thrust::device_vector d_a_values_new(a_values); - thrust::device_vector d_a_colind_new(d_a_colind); - thrust::device_vector d_a_rowptr_new(d_a_rowptr); + thrust::device_vector d_a_colind_new(a_colind); + thrust::device_vector d_a_rowptr_new(a_rowptr); thrust::device_vector d_b_values_new(b_values); - thrust::device_vector d_b_colind_new(d_b_colind); - thrust::device_vector d_b_rowptr_new(d_b_rowptr); - thrust::device_vector d_c_values_new(d_c_values); - thrust::device_vector d_c_colind_new(d_c_colind); - thrust::device_vector d_c_rowptr_new(d_c_rowptr); + thrust::device_vector d_b_colind_new(b_colind); + thrust::device_vector d_b_rowptr_new(b_rowptr); + thrust::device_vector d_c_values_new(c_values); + thrust::device_vector d_c_colind_new(c_colind); + thrust::device_vector d_c_rowptr_new(c_rowptr); spblas::csr_view d_a( d_a_values_new.data().get(), d_a_rowptr_new.data().get(), d_a_colind_new.data().get(), a_shape, a_nnz); @@ -398,9 +403,6 @@ TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { // call numeric on new data spblas::multiply_numeric(state, d_a, d_b, d_c); // move c back to host memory - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values_new.begin(), d_c_values_new.end(), c_values.begin()); thrust::copy(d_c_rowptr_new.begin(), d_c_rowptr_new.end(), diff --git a/test/gtest/device/spgemm_test.cpp b/test/gtest/device/spgemm_test.cpp index 98e1aed..03d5d1f 100644 --- a/test/gtest/device/spgemm_test.cpp +++ b/test/gtest/device/spgemm_test.cpp @@ -33,8 +33,10 @@ TEST(thrust_CsrView, SpGEMM) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + // Can not create the vector with the size because it will use for_each to + // initilize the value in cuda. 
+ thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -42,8 +44,10 @@ TEST(thrust_CsrView, SpGEMM) { spblas::spgemm_state_t state; spblas::multiply_compute(state, d_a, d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -52,9 +56,6 @@ TEST(thrust_CsrView, SpGEMM) { spblas::multiply_fill(state, d_a, d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -121,8 +122,8 @@ TEST(thrust_CsrView, SpGEMM_AScaled) { d_b_colind.data().get(), b_shape, b_nnz); spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -130,8 +131,10 @@ TEST(thrust_CsrView, SpGEMM_AScaled) { spblas::spgemm_state_t state; spblas::multiply_compute(state, spblas::scaled(alpha, d_a), d_b, d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -140,9 +143,6 @@ TEST(thrust_CsrView, SpGEMM_AScaled) { spblas::multiply_fill(state, spblas::scaled(alpha, d_a), d_b, d_c); - std::vector c_values(nnz); - std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); @@ -210,7 +210,8 @@ TEST(thrust_CsrView, SpGEMM_BScaled) { spblas::csr_view b(b_values, b_rowptr, b_colind, b_shape, b_nnz); - thrust::device_vector d_c_rowptr(m + 1); + std::vector c_rowptr(m + 1); + thrust::device_vector d_c_rowptr(c_rowptr); spblas::csr_view d_c( nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); @@ -218,8 +219,10 @@ TEST(thrust_CsrView, SpGEMM_BScaled) { spblas::spgemm_state_t state; spblas::multiply_compute(state, d_a, spblas::scaled(alpha, d_b), d_c); auto nnz = state.result_nnz(); - thrust::device_vector d_c_values(nnz); - thrust::device_vector d_c_colind(nnz); + std::vector c_values(nnz); + std::vector c_colind(nnz); + thrust::device_vector d_c_values(c_values); + thrust::device_vector d_c_colind(c_colind); std::span d_c_values_span(d_c_values.data().get(), nnz); std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); std::span d_c_colind_span(d_c_colind.data().get(), nnz); @@ -228,9 +231,6 @@ TEST(thrust_CsrView, SpGEMM_BScaled) { spblas::multiply_fill(state, d_a, spblas::scaled(alpha, d_b), d_c); - std::vector c_values(nnz); - 
std::vector c_rowptr(m + 1); - std::vector c_colind(nnz); thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
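
For orientation, here is a minimal usage sketch of the spgemm_state_t reuse path introduced above, distilled from test/gtest/device/spgemm_reuse_test.cpp. It is not part of the patch: the csr_view template arguments (float values, int indices), the surrounding host CSR data (a_values, a_rowptr, a_colind, shapes, nnz counts, m, n), and the way C's buffers are rebound after the symbolic phase are assumptions carried over from the existing device tests, not a definitive API reference. As the tests note, device buffers are built from host std::vectors because sizing a thrust::device_vector directly would require device code (nvcc) to initialize its elements.

// Hedged sketch, not part of the diff. Assumes float/int CSR data for A and B
// already on the host (a_values, a_rowptr, a_colind, a_shape, a_nnz, and the
// b_* equivalents), plus output dimensions m x n, as in the device tests.
thrust::device_vector<float> d_a_values(a_values);
thrust::device_vector<int> d_a_rowptr(a_rowptr);
thrust::device_vector<int> d_a_colind(a_colind);
thrust::device_vector<float> d_b_values(b_values);
thrust::device_vector<int> d_b_rowptr(b_rowptr);
thrust::device_vector<int> d_b_colind(b_colind);

spblas::csr_view<float, int> d_a(d_a_values.data().get(), d_a_rowptr.data().get(),
                                 d_a_colind.data().get(), a_shape, a_nnz);
spblas::csr_view<float, int> d_b(d_b_values.data().get(), d_b_rowptr.data().get(),
                                 d_b_colind.data().get(), b_shape, b_nnz);

// C starts with only a row-pointer buffer; values/colind are sized once the
// symbolic phase has reported the output nnz.
std::vector<int> c_rowptr(m + 1);
thrust::device_vector<int> d_c_rowptr(c_rowptr);
spblas::csr_view<float, int> d_c(nullptr, d_c_rowptr.data().get(), nullptr,
                                 {m, n}, 0);

spblas::spgemm_state_t state;
spblas::multiply_symbolic_compute(state, d_a, d_b, d_c);  // symbolic C = A * B
auto nnz = state.result_nnz();

std::vector<float> c_values(nnz);
std::vector<int> c_colind(nnz);
thrust::device_vector<float> d_c_values(c_values);
thrust::device_vector<int> d_c_colind(c_colind);

// Rebind C to the now-allocated buffers (one possible way; the state only
// needs C's pointers) and fill its sparsity pattern.
spblas::csr_view<float, int> d_c_full(d_c_values.data().get(),
                                      d_c_rowptr.data().get(),
                                      d_c_colind.data().get(), {m, n}, nnz);
spblas::multiply_symbolic_fill(state, d_a, d_b, d_c_full);

// The values of A and B may change between calls as long as their sparsity
// stays the same; only the numeric phase is repeated.
spblas::multiply_numeric(state, d_a, d_b, d_c_full);

// The one-shot path is analogous: multiply_compute(state, a, b, c) followed
// by multiply_fill(state, a, b, c); scaled operands such as
// spblas::scaled(alpha, d_a) are accepted in either path.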