diff --git a/CMakeLists.txt b/CMakeLists.txt index 19208d9e4..3820456cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,6 +75,7 @@ else() endif() option(IPC_TOOLKIT_WITH_CUDA "Enable CUDA CCD" OFF) +option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" ON) option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF) option(IPC_TOOLKIT_WITH_ROBIN_MAP "Use Tessil's robin-map rather than std maps" ON) option(IPC_TOOLKIT_WITH_ABSEIL "Use Abseil's hash functions" ON) @@ -83,10 +84,8 @@ option(IPC_TOOLKIT_WITH_INEXACT_CCD "Use the original inexact CCD meth option(IPC_TOOLKIT_WITH_PROFILER "Enable performance profiler" OFF) # Advanced options -option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" OFF) option(IPC_TOOLKIT_WITH_CODE_COVERAGE "Enable coverage reporting" OFF) -mark_as_advanced(IPC_TOOLKIT_WITH_SIMD) # This does not work reliably mark_as_advanced(IPC_TOOLKIT_WITH_CODE_COVERAGE) # This is used in GitHub Actions # Set default minimum C++ standard @@ -112,9 +111,10 @@ include(ipc_toolkit_use_colors) set(CMAKE_POSITION_INDEPENDENT_CODE ON) ################################################################################ -# CUDA +# Verify Options ################################################################################ +# CUDA support if(IPC_TOOLKIT_WITH_CUDA) # If CMAKE_CUDA_ARCHITECTURES was not specified, set it to native. if(DEFINED CMAKE_CUDA_ARCHITECTURES) @@ -129,6 +129,19 @@ if(IPC_TOOLKIT_WITH_CUDA) enable_language(CUDA) endif() +## SIMD support +if(IPC_TOOLKIT_WITH_SIMD) + # Figure out SIMD support + message(STATUS "Testing SIMD capabilities...") + find_package(SIMD) + if (SIMD_CXX_FLAGS) + message(STATUS "SIMD support found: ${SIMD_CXX_FLAGS}") + else() + message(WARNING "SIMD support requested but not found. Continuing without SIMD.") + set(IPC_TOOLKIT_WITH_SIMD OFF CACHE BOOL "Enable SIMD" FORCE) + endif() +endif() + ################################################################################ # IPC Toolkit Library ################################################################################ @@ -247,14 +260,15 @@ target_link_libraries(ipc_toolkit PRIVATE ipc::toolkit::warnings) ## SIMD support if(IPC_TOOLKIT_WITH_SIMD) - # Figure out SIMD support - message(STATUS "Testing SIMD capabilities...") - find_package(SIMD) # Add SIMD flags to compiler flags - message(STATUS "Using SIMD flags: ${SIMD_FLAGS}") - target_compile_options(ipc_toolkit PRIVATE ${SIMD_FLAGS}) -else() - message(STATUS "SIMD support disabled") + target_compile_options(ipc_toolkit PRIVATE ${SIMD_CXX_FLAGS}) + + # Link against cross-platform xsimd library + include(xsimd) + target_link_libraries(ipc_toolkit PRIVATE xsimd::xsimd) + + # Disable vectorization in Eigen since I've found it to have alignment issues. + target_compile_definitions(Eigen3_Eigen INTERFACE EIGEN_DONT_VECTORIZE=1) endif() # For MSVC, do not use the min and max macros. diff --git a/CMakePresets.json b/CMakePresets.json index fee266796..9c283ee56 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -51,16 +51,6 @@ "IPC_TOOLKIT_WITH_CUDA": "ON" } }, - { - "name": "simd", - "inherits": "release", - "displayName": "SIMD Enabled", - "description": "Build with SIMD optimizations", - "binaryDir": "${sourceDir}/build/simd", - "cacheVariables": { - "IPC_TOOLKIT_WITH_SIMD": "ON" - } - }, { "name": "test", "inherits": "debug", @@ -82,7 +72,6 @@ "cacheVariables": { "IPC_TOOLKIT_BUILD_PYTHON": "ON", "IPC_TOOLKIT_BUILD_TESTS": "OFF", - "IPC_TOOLKIT_WITH_SIMD": "OFF", "IPC_TOOLKIT_WITH_CUDA": "OFF" } }, @@ -166,4 +155,4 @@ } } ] -} +} \ No newline at end of file diff --git a/IPCToolkitOptions.cmake.sample b/IPCToolkitOptions.cmake.sample index 8b0621631..6ebb4f7a3 100644 --- a/IPCToolkitOptions.cmake.sample +++ b/IPCToolkitOptions.cmake.sample @@ -31,12 +31,12 @@ # option(IPC_TOOLKIT_BUILD_TESTS "Build unit-tests" ON) # option(IPC_TOOLKIT_BUILD_PYTHON "Build Python bindings" OFF) # option(IPC_TOOLKIT_WITH_CUDA "Enable CUDA CCD" OFF) +# option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" ON) # option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF) # option(IPC_TOOLKIT_WITH_ROBIN_MAP "Use Tessil's robin-map rather than std maps" ON) # option(IPC_TOOLKIT_WITH_ABSEIL "Use Abseil's hash functions" ON) # option(IPC_TOOLKIT_WITH_FILIB "Use filib for interval arithmetic" ON) # option(IPC_TOOLKIT_WITH_INEXACT_CCD "Use the original inexact CCD method of IPC" OFF) -# option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" OFF) # option(IPC_TOOLKIT_WITH_CODE_COVERAGE "Enable coverage reporting" OFF) # option(IPC_TOOLKIT_TESTS_CCD_BENCHMARK "Enable CCD benchmark test" ON) # set(IPC_TOOLKIT_TESTS_CCD_BENCHMARK_DIR "" CACHE PATH "Path to the CCD benchmark directory") diff --git a/cmake/recipes/xsimd.cmake b/cmake/recipes/xsimd.cmake new file mode 100644 index 000000000..cf23784b8 --- /dev/null +++ b/cmake/recipes/xsimd.cmake @@ -0,0 +1,15 @@ +# xsimd (https://github.com/xtensor-stack/xsimd) +# License: BSD-3-Clause +if(TARGET xsimd::xsimd) + return() +endif() + +message(STATUS "Third-party: creating target 'xsimd::xsimd'") + +include(CPM) +CPMAddPackage("gh:xtensor-stack/xsimd#14.0.0") + +add_library(xsimd::xsimd ALIAS xsimd) + +# Folder name for IDE +set_target_properties(xsimd PROPERTIES FOLDER "ThirdParty") diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp index 0383ac9c0..7f3f5d311 100644 --- a/src/ipc/broad_phase/lbvh.cpp +++ b/src/ipc/broad_phase/lbvh.cpp @@ -9,11 +9,16 @@ #include #include -#ifdef __APPLE__ -// We utilize SIMD registers to compare 1 Node against 4 Queries simultaneously. -#include +#ifdef IPC_TOOLKIT_WITH_SIMD +// We utilize SIMD registers to compare one node against multiple queries +// simultaneously, with the number of queries determined by +// xs::batch::size. +#include +namespace xs = xsimd; #endif +#include + using namespace std::placeholders; namespace ipc { @@ -448,9 +453,9 @@ namespace { } while (node_idx != LBVH::Node::INVALID_POINTER); // Same as root } -#ifdef __APPLE__ +#ifdef IPC_TOOLKIT_WITH_SIMD // SIMD Traversal - // Traverses 4 queries simultaneously using SIMD. + // Traverses multiple queries simultaneously using SIMD. template void traverse_lbvh_simd( const LBVH::Node* queries, @@ -459,28 +464,37 @@ namespace { const std::function& can_collide, std::vector& candidates) { - assert(n_queries >= 1 && n_queries <= 4); - // Load 4 queries into single registers (Structure of Arrays) - auto make_simd = [&](auto F) -> simd_float4 { - return simd_float4 { - F(0), - n_queries > 1 ? F(1) : 0.0f, - n_queries > 2 ? F(2) : 0.0f, - n_queries > 3 ? F(3) : 0.0f, - }; + using batch_t = xs::batch; + assert(n_queries >= 1 && n_queries <= batch_t::size); + + // Load queries into single registers + auto make_simd = [&](auto F) -> batch_t { + // 1. Create a buffer of the correct architecture-dependent size + alignas(xs::default_arch::alignment()) + std::array + buffer {}; + +#pragma unroll + // 2. Fill the buffer, respecting the actual number of queries + for (size_t i = 0; i < batch_t::size; ++i) { + buffer[i] = (i < n_queries) ? F(static_cast(i)) : 0.0f; + } + + // 3. Load the buffer into the SIMD register + return batch_t::load_aligned(buffer.data()); }; - const simd_float4 q_min_x = + const auto q_min_x = make_simd([&](int k) { return queries[k].aabb_min.x(); }); - const simd_float4 q_min_y = + const auto q_min_y = make_simd([&](int k) { return queries[k].aabb_min.y(); }); - const simd_float4 q_min_z = + const auto q_min_z = make_simd([&](int k) { return queries[k].aabb_min.z(); }); - const simd_float4 q_max_x = + const auto q_max_x = make_simd([&](int k) { return queries[k].aabb_max.x(); }); - const simd_float4 q_max_y = + const auto q_max_y = make_simd([&](int k) { return queries[k].aabb_max.y(); }); - const simd_float4 q_max_z = + const auto q_max_z = make_simd([&](int k) { return queries[k].aabb_max.z(); }); // Use a fixed-size array as a stack to avoid dynamic allocations @@ -505,31 +519,33 @@ namespace { const LBVH::Node& child_l = lbvh[node.left]; const LBVH::Node& child_r = lbvh[node.right]; - // 1. Intersect 4 queries at once + // 1. Intersect multiple queries at once // (child_l.min <= query.max) && (query.min <= child_l.max) - const simd_int4 intersects_l = (child_l.aabb_min.x() <= q_max_x) + const xs::batch_bool intersects_l = + (child_l.aabb_min.x() <= q_max_x) & (child_l.aabb_min.y() <= q_max_y) & (child_l.aabb_min.z() <= q_max_z) & (q_min_x <= child_l.aabb_max.x()) & (q_min_y <= child_l.aabb_max.y()) & (q_min_z <= child_l.aabb_max.z()); - // 2. Intersect 4 queries at once + // 2. Intersect multiple queries at once // (child_r.min <= query.max) && (query.min <= child_r.max) - const simd_int4 intersects_r = (child_r.aabb_min.x() <= q_max_x) + const xs::batch_bool intersects_r = + (child_r.aabb_min.x() <= q_max_x) & (child_r.aabb_min.y() <= q_max_y) & (child_r.aabb_min.z() <= q_max_z) & (q_min_x <= child_r.aabb_max.x()) & (q_min_y <= child_r.aabb_max.y()) & (q_min_z <= child_r.aabb_max.z()); - const bool any_intersects_l = simd_any(intersects_l); - const bool any_intersects_r = simd_any(intersects_r); + const bool any_intersects_l = xs::any(intersects_l); + const bool any_intersects_r = xs::any(intersects_r); // Query overlaps a leaf node => report collision if (any_intersects_l && child_l.is_leaf()) { for (int k = 0; k < n_queries; ++k) { - if (intersects_l[k]) { + if (intersects_l.get(k)) { attempt_add_candidate< Candidate, swap_order, triangular>( queries[k], child_l, can_collide, candidates); @@ -538,7 +554,7 @@ namespace { } if (any_intersects_r && child_r.is_leaf()) { for (int k = 0; k < n_queries; ++k) { - if (intersects_r[k]) { + if (intersects_r.get(k)) { attempt_add_candidate< Candidate, swap_order, triangular>( queries[k], child_r, can_collide, candidates); @@ -576,9 +592,12 @@ namespace { const std::function& can_collide, tbb::enumerable_thread_specific>& storage) { -#ifdef __APPLE__ // Only support SIMD on Apple platforms for now - constexpr size_t SIMD_SIZE = use_simd ? 4 : 1; - constexpr size_t GRAIN_SIZE = use_simd ? 16 : 1; +#ifdef IPC_TOOLKIT_WITH_SIMD // Enable SIMD acceleration when available + constexpr size_t SIMD_SIZE = use_simd ? xs::batch::size : 1; + static_assert( + 64 % xs::batch::size == 0, "GRAIN_SIZE must be an integer"); + constexpr size_t GRAIN_SIZE = + use_simd ? (64 / xs::batch::size) : 1; #else constexpr size_t SIMD_SIZE = 1; constexpr size_t GRAIN_SIZE = 1; @@ -595,11 +614,13 @@ namespace { tbb::blocked_range(size_t(0), n_tasks, GRAIN_SIZE), [&](const tbb::blocked_range& r) { auto& local_candidates = storage.local(); +#ifdef IPC_TOOLKIT_WITH_SIMD const size_t actual_end = // Handle tail case std::min(SIMD_SIZE * r.end(), n_source_leaves); +#endif for (size_t i = r.begin(); i < r.end(); ++i) { const size_t idx = SIMD_SIZE * i; -#ifdef __APPLE__ +#ifdef IPC_TOOLKIT_WITH_SIMD if constexpr (use_simd) { assert(actual_end - idx >= 1); traverse_lbvh_simd( @@ -611,7 +632,7 @@ namespace { traverse_lbvh( source[source_leaf_offset + idx], target, can_collide, local_candidates); -#ifdef __APPLE__ +#ifdef IPC_TOOLKIT_WITH_SIMD } #endif } diff --git a/src/ipc/config.hpp.in b/src/ipc/config.hpp.in index 6a0db7187..b47f3d9b8 100644 --- a/src/ipc/config.hpp.in +++ b/src/ipc/config.hpp.in @@ -13,6 +13,7 @@ #cmakedefine IPC_TOOLKIT_WITH_INEXACT_CCD #cmakedefine IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION #cmakedefine IPC_TOOLKIT_WITH_CUDA +#cmakedefine IPC_TOOLKIT_WITH_SIMD #cmakedefine IPC_TOOLKIT_WITH_ROBIN_MAP #cmakedefine IPC_TOOLKIT_WITH_ABSEIL #cmakedefine IPC_TOOLKIT_WITH_FILIB diff --git a/src/ipc/utils/eigen_ext.hpp b/src/ipc/utils/eigen_ext.hpp index ffdda82eb..a364bac0b 100644 --- a/src/ipc/utils/eigen_ext.hpp +++ b/src/ipc/utils/eigen_ext.hpp @@ -5,6 +5,13 @@ #include +#ifdef EIGEN_DONT_VECTORIZE +// NOTE: Avoid error about abs casting double to int. Eigen does this +// internally but seemingly only if EIGEN_DONT_VECTORIZE is not defined. +// TODO: We should always use std::abs to avoid this issue. +EIGEN_USING_STD(abs); // using std::abs; +#endif + namespace Eigen { template using RowRef = Ref>; template using ConstRef = const Ref&;