From 1d795753f889787c6e7f7bb46bbf1f96018ec88e Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Fri, 13 Apr 2018 17:57:56 +0200 Subject: [PATCH 1/4] added benchmark case for FFT convolutions --- CMakeLists.txt | 39 ++++++++++++++++++++++---- tests/CMakeLists.txt | 19 ++++++++++++- tests/bench_gpu_convolve.cu | 56 +++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 6 deletions(-) create mode 100644 tests/bench_gpu_convolve.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 8617902..d57d76c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,14 +4,36 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8) # project name PROJECT(FourierConvolutionCUDALib CXX C) if(NOT(${CMAKE_VERSION} VERSION_LESS "3.0.0")) -cmake_policy(SET CMP0042 NEW) + cmake_policy(SET CMP0042 NEW) endif() if(${CMAKE_VERSION} VERSION_GREATER "3.1") -cmake_policy(SET CMP0054 NEW) + cmake_policy(SET CMP0054 NEW) endif() +set(CMAKE_CXX_STANDARD 03) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + option(ENABLE_TESTING "enable the test suite (requires boost to be installed" ON) +option(ENABLE_BENCHMARKS "enable the benchmark suite (requires google/benchmark to be installed" OFF) +# option(ENABLE_CXX11_ABI "enable _GLIBCXX_USE_CXX11_ABI in GCC 5.0+" ON) +# if(${WITH_CXX11_ABI}) +# set(CXX11_ABI_VALUE 1) +# else() +# set(CXX11_ABI_VALUE 0) +# endif() + + +# IF(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_GNUCC OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") +# if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0") +# set(WITH_CXX11_ABI ON) +# set(CXX11_ABI_VALUE 1) +# endif() + +# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${CXX11_ABI_VALUE}) +# message(">> [${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}] adding -D_GLIBCXX_USE_CXX11_ABI=${CXX11_ABI_VALUE}") +# endif() + # version number SET (FOURIERCONVOLUTIONCUDALIB_NAME "CUDA FOURIER CONVOLUTION LIBRARY") SET (FOURIERCONVOLUTIONCUDALIB_CODENAME "${PROJECT_NAME}") @@ -75,7 +97,7 @@ IF(INCLUDE_CUDA) 
FIND_PACKAGE(CUDA) IF(CUDA_FOUND) SET(CUDA_VERBOSE_BUILD ON) - set(CUDA_ARCHS 10;20;30;35;37;50;52;60;61;70) + #set(CUDA_ARCHS 10;20;30;35;37;50;52;60;61;70) SET(CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") IF(APPLE) @@ -92,6 +114,7 @@ IF(INCLUDE_CUDA) ENDIF() ENDIF(APPLE) + if(NOT DEFINED SMS) set(CUDA_ARCHS 10;20;21) IF("${CUDA_VERSION}" VERSION_GREATER "4.5") @@ -118,7 +141,9 @@ IF(INCLUDE_CUDA) IF("${CUDA_VERSION}" VERSION_GREATER "8.0") list(APPEND CUDA_ARCHS 70)#8.0+ ENDIF() - + else() + set(CUDA_ARCHS ${SMS}) + endif() list(SORT CUDA_ARCHS) @@ -133,7 +158,11 @@ IF(INCLUDE_CUDA) list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_${CUDA_HIGHEST_SM},code=compute_${CUDA_HIGHEST_SM}") - MESSAGE(">> CUDA version ${CUDA_VERSION} detected, compiling for Compute Capability/ies ${CUDA_ARCHS} (highest SM: ${CUDA_HIGHEST_SM})") + MESSAGE(">> CUDA version ${CUDA_VERSION} detected, compiling for Compute Capability/ies ${CUDA_ARCHS} (highest SM: ${CUDA_HIGHEST_SM})") + + # if(WITH_CXX11_ABI) + # list(APPEND CUDA_NVCC_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=${CXX11_ABI_VALUE}") + # endif() set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE};-O2;--use_fast_math) set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG};-g;-G) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b8047c0..e7ea31f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,6 +1,6 @@ INCLUDE_DIRECTORIES(.) 
-FIND_PACKAGE (Boost 1.42 QUIET COMPONENTS # system filesystem +FIND_PACKAGE (Boost 1.42 QUIET COMPONENTS system filesystem unit_test_framework REQUIRED) IF(Boost_FOUND) INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS}) @@ -9,6 +9,7 @@ ENDIF() FIND_PACKAGE(CUDA) + IF(CUDA_FOUND) INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src) LINK_DIRECTORIES(${PROJECT_BINARY_DIR}/src) @@ -17,6 +18,7 @@ CUDA_ADD_EXECUTABLE(test_gpu_convolve test_gpu_convolve.cpp image_stack_utils.cp CUDA_ADD_EXECUTABLE(test_gpu_numerical_stability test_gpu_numerical_stability.cpp image_stack_utils.cpp) CUDA_ADD_EXECUTABLE(test_how_cufft_works test_how_cufft_works.cu image_stack_utils.cpp) + IF(Boost_FOUND) MESSAGE(">> Boost UTF: ${Boost_LIBRARIES} ") @@ -32,6 +34,21 @@ IF(Boost_FOUND) ENDIF(Boost_FOUND) +find_package(benchmark) +find_package(Threads) + +if(benchmark_FOUND) + message(STATUS "found benchmark package") + CUDA_ADD_EXECUTABLE(bench_gpu_convolve bench_gpu_convolve.cu) + target_link_libraries(bench_gpu_convolve benchmark::benchmark ${PROJECT_NAME} -pthread stdc++) + target_include_directories(bench_gpu_convolve PRIVATE "${benchmark_DIR}/../../../include") + target_compile_options(bench_gpu_convolve PRIVATE -pthread) + CUDA_ADD_CUFFT_TO_TARGET( bench_gpu_convolve ) + +else() + message(STATUS "benchmark package not found") +endif() + ELSE(CUDA_FOUND) MESSAGE(WARNING "Skipping GPU based tests, CUDA not found\!") ENDIF(CUDA_FOUND) diff --git a/tests/bench_gpu_convolve.cu b/tests/bench_gpu_convolve.cu new file mode 100644 index 0000000..cf9c58b --- /dev/null +++ b/tests/bench_gpu_convolve.cu @@ -0,0 +1,56 @@ + +#include "benchmark/benchmark.h" + +#include "test_fixtures.hpp" +#include "padd_utils.h" + +#include "convolution3Dfft.h" +#include "test_utils.hpp" +#include "image_stack_utils.h" +#include "traits.hpp" + +#include + +namespace fc = fourierconvolution; + + +static void BM_simple_fp32(benchmark::State& state) { + + fc::default_3D_fixture fix; + + std::vector image_dims(3,64); + std::size_t 
image_len = std::pow(64,3); + std::vector image(image_len,0.); + + std::vector kernel_dims(3,3); + std::size_t kernel_len = std::pow(3,3); + std::vector kernel(kernel_len,0); + + while (state.KeepRunning()){ + + convolution3DfftCUDAInPlace(&image[0], &image_dims[0] , + &kernel[0], &kernel_dims[0] , + selectDeviceWithHighestComputeCapability()); + } + +} + +BENCHMARK(BM_simple_fp32); +BENCHMARK_MAIN(); + +// BOOST_FIXTURE_TEST_SUITE(legacy_convolution, +// fc::default_3D_fixture) + +// BOOST_AUTO_TEST_CASE(trivial_convolve) { + +// float* image = image_.data(); +// std::vector kernel(kernel_size_,0); + +// convolution3DfftCUDAInPlace(image, &image_dims_[0], +// &kernel[0], &kernel_dims_[0], +// selectDeviceWithHighestComputeCapability()); + +// float sum = std::accumulate(image, image + image_size_, 0.f); +// BOOST_CHECK_CLOSE(sum, 0.f, .00001); + +// } From f01795ce91499a3ddb0f7180fa4ee518a8b3a468 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Fri, 13 Apr 2018 18:13:27 +0200 Subject: [PATCH 2/4] ditched libbenchmark due to linking problems with this C++2003 project --- tests/CMakeLists.txt | 27 ++++++++------------------- tests/bench_gpu_convolve.cu | 36 ++++++++---------------------------- 2 files changed, 16 insertions(+), 47 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e7ea31f..79d96dc 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,6 +1,6 @@ INCLUDE_DIRECTORIES(.) 
-FIND_PACKAGE (Boost 1.42 QUIET COMPONENTS system filesystem +FIND_PACKAGE (Boost 1.42 QUIET COMPONENTS system filesystem timer unit_test_framework REQUIRED) IF(Boost_FOUND) INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS}) @@ -21,33 +21,22 @@ CUDA_ADD_EXECUTABLE(test_how_cufft_works test_how_cufft_works.cu image_stack_uti IF(Boost_FOUND) - MESSAGE(">> Boost UTF: ${Boost_LIBRARIES} ") - target_link_libraries(test_gpu_convolve ${Boost_LIBRARIES} ${PROJECT_NAME}) + MESSAGE(">> Boost UTF: ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} ") + target_link_libraries(test_gpu_convolve ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} ${PROJECT_NAME}) set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "-DBOOST_TEST_DYN_LINK") - target_link_libraries(test_gpu_numerical_stability ${Boost_LIBRARIES} ${PROJECT_NAME}) + target_link_libraries(test_gpu_numerical_stability ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} ${PROJECT_NAME}) set_target_properties(test_gpu_numerical_stability PROPERTIES COMPILE_FLAGS "-DBOOST_TEST_DYN_LINK") - target_link_libraries(test_how_cufft_works ${Boost_LIBRARIES} ) + target_link_libraries(test_how_cufft_works ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} ) CUDA_ADD_CUFFT_TO_TARGET( test_how_cufft_works ) ENDIF(Boost_FOUND) -find_package(benchmark) -find_package(Threads) - -if(benchmark_FOUND) - message(STATUS "found benchmark package") - CUDA_ADD_EXECUTABLE(bench_gpu_convolve bench_gpu_convolve.cu) - target_link_libraries(bench_gpu_convolve benchmark::benchmark ${PROJECT_NAME} -pthread stdc++) - target_include_directories(bench_gpu_convolve PRIVATE "${benchmark_DIR}/../../../include") - target_compile_options(bench_gpu_convolve PRIVATE -pthread) - CUDA_ADD_CUFFT_TO_TARGET( bench_gpu_convolve ) - -else() - message(STATUS "benchmark package not found") -endif() +message(STATUS "found benchmark package") +CUDA_ADD_EXECUTABLE(bench_gpu_convolve bench_gpu_convolve.cu) +target_link_libraries(bench_gpu_convolve ${Boost_TIMER_LIBRARY} ${PROJECT_NAME}) ELSE(CUDA_FOUND) MESSAGE(WARNING 
"Skipping GPU based tests, CUDA not found\!") diff --git a/tests/bench_gpu_convolve.cu b/tests/bench_gpu_convolve.cu index cf9c58b..51c979e 100644 --- a/tests/bench_gpu_convolve.cu +++ b/tests/bench_gpu_convolve.cu @@ -1,7 +1,5 @@ +#include -#include "benchmark/benchmark.h" - -#include "test_fixtures.hpp" #include "padd_utils.h" #include "convolution3Dfft.h" @@ -10,13 +8,12 @@ #include "traits.hpp" #include +#include -namespace fc = fourierconvolution; - +using namespace boost::timer; -static void BM_simple_fp32(benchmark::State& state) { +int main(int argc, char** argv) { - fc::default_3D_fixture fix; std::vector image_dims(3,64); std::size_t image_len = std::pow(64,3); @@ -26,31 +23,14 @@ static void BM_simple_fp32(benchmark::State& state) { std::size_t kernel_len = std::pow(3,3); std::vector kernel(kernel_len,0); - while (state.KeepRunning()){ + cpu_timer timer; + for (int i = 0;i<10;++i){ convolution3DfftCUDAInPlace(&image[0], &image_dims[0] , &kernel[0], &kernel_dims[0] , selectDeviceWithHighestComputeCapability()); } + std::cout << "inplace, 10x, (image 64**3, kernel 3**3)" << timer.format() << '\n'; -} - -BENCHMARK(BM_simple_fp32); -BENCHMARK_MAIN(); - -// BOOST_FIXTURE_TEST_SUITE(legacy_convolution, -// fc::default_3D_fixture) -// BOOST_AUTO_TEST_CASE(trivial_convolve) { - -// float* image = image_.data(); -// std::vector kernel(kernel_size_,0); - -// convolution3DfftCUDAInPlace(image, &image_dims_[0], -// &kernel[0], &kernel_dims_[0], -// selectDeviceWithHighestComputeCapability()); - -// float sum = std::accumulate(image, image + image_size_, 0.f); -// BOOST_CHECK_CLOSE(sum, 0.f, .00001); - -// } +} From b741e5e4d84157e0e7842892987b49954e552f59 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Fri, 4 May 2018 16:53:08 +0200 Subject: [PATCH 3/4] added minimal benchmarking CLI interface --- tests/CMakeLists.txt | 11 ++++++----- tests/bench_gpu_convolve.cu | 39 +++++++++++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 13 deletions(-) diff 
--git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 79d96dc..4c15f59 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,7 +1,7 @@ INCLUDE_DIRECTORIES(.) -FIND_PACKAGE (Boost 1.42 QUIET COMPONENTS system filesystem timer - unit_test_framework REQUIRED) +FIND_PACKAGE (Boost 1.42 QUIET COMPONENTS system filesystem timer unit_test_framework program_options + REQUIRED) IF(Boost_FOUND) INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS}) LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) @@ -31,12 +31,13 @@ IF(Boost_FOUND) target_link_libraries(test_how_cufft_works ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} ) CUDA_ADD_CUFFT_TO_TARGET( test_how_cufft_works ) + if(ENABLE_BENCHMARKS) + CUDA_ADD_EXECUTABLE(bench_gpu_convolve bench_gpu_convolve.cu) + target_link_libraries(bench_gpu_convolve ${Boost_TIMER_LIBRARY} ${Boost_PROGRAM_OPTIONS_LIBRARY} ${PROJECT_NAME}) +endif() ENDIF(Boost_FOUND) -message(STATUS "found benchmark package") -CUDA_ADD_EXECUTABLE(bench_gpu_convolve bench_gpu_convolve.cu) -target_link_libraries(bench_gpu_convolve ${Boost_TIMER_LIBRARY} ${PROJECT_NAME}) ELSE(CUDA_FOUND) MESSAGE(WARNING "Skipping GPU based tests, CUDA not found\!") diff --git a/tests/bench_gpu_convolve.cu b/tests/bench_gpu_convolve.cu index 51c979e..1301716 100644 --- a/tests/bench_gpu_convolve.cu +++ b/tests/bench_gpu_convolve.cu @@ -1,4 +1,5 @@ #include +#include #include "padd_utils.h" @@ -11,16 +12,38 @@ #include using namespace boost::timer; +namespace po = boost::program_options; + +int main(int ac, char** av) { + + // Declare the supported options. 
+ po::options_description desc("Allowed options"); + desc.add_options() + ("help", "produce help message") + ("image_size", po::value()->default_value(128), "set the 3D image size, so the image will extent sizexsizexsize") + ("kernel_size", po::value()->default_value(3), "set the kernel size, so the kernel will extent sizexsizexsize") + ("gpu", po::value()->default_value(-1), "gpu device to use, if value=-1, the highest device with highest compute capability is used") + ; + + po::variables_map vm; + po::store(po::parse_command_line(ac, av, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << "\n"; + return 1; + } -int main(int argc, char** argv) { - + int device_id = vm["gpu"].as(); + if(device_id < 0) + device_id = selectDeviceWithHighestComputeCapability(); - std::vector image_dims(3,64); - std::size_t image_len = std::pow(64,3); + std::vector image_dims(3,vm["image_size"].as()); + std::size_t image_len = std::pow(vm["image_size"].as(),3); std::vector image(image_len,0.); - std::vector kernel_dims(3,3); - std::size_t kernel_len = std::pow(3,3); + std::vector kernel_dims(3,vm["kernel_size"].as()); + std::size_t kernel_len = std::pow(vm["kernel_size"].as(),3); std::vector kernel(kernel_len,0); cpu_timer timer; @@ -28,9 +51,9 @@ int main(int argc, char** argv) { convolution3DfftCUDAInPlace(&image[0], &image_dims[0] , &kernel[0], &kernel_dims[0] , - selectDeviceWithHighestComputeCapability()); + device_id); } - std::cout << "inplace, 10x, (image 64**3, kernel 3**3)" << timer.format() << '\n'; + std::cout << "[gpu "<< device_id << "] inplace, 10x, (image "<< image_dims.front() <<"**3, kernel "<< kernel_dims.front() <<"**3)" << timer.format() << '\n'; } From cf1ab1d52077bf78b5710314069ae9194e2bff9d Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Fri, 4 May 2018 16:55:10 +0200 Subject: [PATCH 4/4] added docs on benchmark tool --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 
707dc4e..e66edd4 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,20 @@ $ cmake -DCMAKE_INSTALL_PREFIX=/directory/of/your/choice -DBOOST_ROOT=/path/to/b Here, ```/path/to/boost/root``` should contain the boost libraries and the boost headers. +Benchmarks +---------- + +The repo contains a small utility (in alpha stage) that can be used to run benchmarks. It is only built when CUDA and Boost (including the timer and program_options libraries) are found. To enable building it, do: + +``` bash +$ cd repo +$ mkdir build +$ cd build +$ cmake -DENABLE_BENCHMARKS=ON .. +$ make +$ ./tests/bench_gpu_convolve +[gpu 0] inplace, 10x, (image 128**3, kernel 3**3) 1.326021s wall, 1.020000s user + 0.300000s system = 1.320000s CPU (99.5%) +``` How to get Help ===============