From 3a2e03e3d6a68e9ffdadda206d88eeba93d17d9e Mon Sep 17 00:00:00 2001 From: Zachary Ferguson Date: Mon, 26 Jan 2026 01:28:46 -0500 Subject: [PATCH 1/7] Add optional SIMD support via xsimd - Add cross-platform SIMD in LBVH using the xsimd library --- CMakeLists.txt | 34 ++++++++++++++++------- cmake/recipes/xsimd.cmake | 15 +++++++++++ src/ipc/broad_phase/lbvh.cpp | 52 +++++++++++++++++++++--------------- src/ipc/config.hpp.in | 1 + 4 files changed, 70 insertions(+), 32 deletions(-) create mode 100644 cmake/recipes/xsimd.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 19208d9e4..3a4c23b54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,6 +75,7 @@ else() endif() option(IPC_TOOLKIT_WITH_CUDA "Enable CUDA CCD" OFF) +option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" ON) option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF) option(IPC_TOOLKIT_WITH_ROBIN_MAP "Use Tessil's robin-map rather than std maps" ON) option(IPC_TOOLKIT_WITH_ABSEIL "Use Abseil's hash functions" ON) @@ -83,10 +84,8 @@ option(IPC_TOOLKIT_WITH_INEXACT_CCD "Use the original inexact CCD meth option(IPC_TOOLKIT_WITH_PROFILER "Enable performance profiler" OFF) # Advanced options -option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" OFF) option(IPC_TOOLKIT_WITH_CODE_COVERAGE "Enable coverage reporting" OFF) -mark_as_advanced(IPC_TOOLKIT_WITH_SIMD) # This does not work reliably mark_as_advanced(IPC_TOOLKIT_WITH_CODE_COVERAGE) # This is used in GitHub Actions # Set default minimum C++ standard @@ -112,9 +111,10 @@ include(ipc_toolkit_use_colors) set(CMAKE_POSITION_INDEPENDENT_CODE ON) ################################################################################ -# CUDA +# Verify Options ################################################################################ +# CUDA support if(IPC_TOOLKIT_WITH_CUDA) # If CMAKE_CUDA_ARCHITECTURES was not specified, set it to native. if(DEFINED CMAKE_CUDA_ARCHITECTURES) @@ -129,6 +129,19 @@ if(IPC_TOOLKIT_WITH_CUDA) enable_language(CUDA) endif() +## SIMD support +if(IPC_TOOLKIT_WITH_SIMD) + # Figure out SIMD support + message(STATUS "Testing SIMD capabilities...") + find_package(SIMD) + if (SIMD_CXX_FLAGS) + message(STATUS "SIMD support found: ${SIMD_CXX_FLAGS}") + else() + message(WARNING "SIMD support requested but not found. Continuing without SIMD.") + set(IPC_TOOLKIT_WITH_SIMD OFF CACHE BOOL "Enable SIMD" FORCE) + endif() +endif() + ################################################################################ # IPC Toolkit Library ################################################################################ @@ -247,14 +260,15 @@ target_link_libraries(ipc_toolkit PRIVATE ipc::toolkit::warnings) ## SIMD support if(IPC_TOOLKIT_WITH_SIMD) - # Figure out SIMD support - message(STATUS "Testing SIMD capabilities...") - find_package(SIMD) # Add SIMD flags to compiler flags - message(STATUS "Using SIMD flags: ${SIMD_FLAGS}") - target_compile_options(ipc_toolkit PRIVATE ${SIMD_FLAGS}) -else() - message(STATUS "SIMD support disabled") + target_compile_options(ipc_toolkit PRIVATE ${SIMD_CXX_FLAGS}) + + # Link against cross-platform xsimd library + include(xsimd) + target_link_libraries(ipc_toolkit PRIVATE xsimd::xsimd) + + # Disable vectorization in Eigen since I've found it to cause issues. + target_compile_definitions(Eigen3_Eigen INTERFACE EIGEN_DONT_VECTORIZE) endif() # For MSVC, do not use the min and max macros. diff --git a/cmake/recipes/xsimd.cmake b/cmake/recipes/xsimd.cmake new file mode 100644 index 000000000..cf23784b8 --- /dev/null +++ b/cmake/recipes/xsimd.cmake @@ -0,0 +1,15 @@ +# xsimd (https://github.com/xtensor-stack/xsimd) +# License: BSD-3-Clause +if(TARGET xsimd::xsimd) + return() +endif() + +message(STATUS "Third-party: creating target 'xsimd::xsimd'") + +include(CPM) +CPMAddPackage("gh:xtensor-stack/xsimd#14.0.0") + +add_library(xsimd::xsimd ALIAS xsimd) + +# Folder name for IDE +set_target_properties(xsimd PROPERTIES FOLDER "ThirdParty") diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp index 0383ac9c0..12a1e3a9a 100644 --- a/src/ipc/broad_phase/lbvh.cpp +++ b/src/ipc/broad_phase/lbvh.cpp @@ -9,9 +9,10 @@ #include #include -#ifdef __APPLE__ +#ifdef IPC_TOOLKIT_WITH_SIMD // We utilize SIMD registers to compare 1 Node against 4 Queries simultaneously. -#include +#include +namespace xs = xsimd; #endif using namespace std::placeholders; @@ -448,7 +449,7 @@ namespace { } while (node_idx != LBVH::Node::INVALID_POINTER); // Same as root } -#ifdef __APPLE__ +#ifdef IPC_TOOLKIT_WITH_SIMD // SIMD Traversal // Traverses 4 queries simultaneously using SIMD. template @@ -461,8 +462,8 @@ namespace { { assert(n_queries >= 1 && n_queries <= 4); // Load 4 queries into single registers (Structure of Arrays) - auto make_simd = [&](auto F) -> simd_float4 { - return simd_float4 { + auto make_simd = [&](auto F) -> xs::batch { + return xs::batch { F(0), n_queries > 1 ? F(1) : 0.0f, n_queries > 2 ? F(2) : 0.0f, @@ -470,17 +471,17 @@ namespace { }; }; - const simd_float4 q_min_x = + const auto q_min_x = make_simd([&](int k) { return queries[k].aabb_min.x(); }); - const simd_float4 q_min_y = + const auto q_min_y = make_simd([&](int k) { return queries[k].aabb_min.y(); }); - const simd_float4 q_min_z = + const auto q_min_z = make_simd([&](int k) { return queries[k].aabb_min.z(); }); - const simd_float4 q_max_x = + const auto q_max_x = make_simd([&](int k) { return queries[k].aabb_max.x(); }); - const simd_float4 q_max_y = + const auto q_max_y = make_simd([&](int k) { return queries[k].aabb_max.y(); }); - const simd_float4 q_max_z = + const auto q_max_z = make_simd([&](int k) { return queries[k].aabb_max.z(); }); // Use a fixed-size array as a stack to avoid dynamic allocations @@ -507,7 +508,8 @@ namespace { // 1. Intersect 4 queries at once // (child_l.min <= query.max) && (query.min <= child_l.max) - const simd_int4 intersects_l = (child_l.aabb_min.x() <= q_max_x) + const xs::batch_bool intersects_l = + (child_l.aabb_min.x() <= q_max_x) & (child_l.aabb_min.y() <= q_max_y) & (child_l.aabb_min.z() <= q_max_z) & (q_min_x <= child_l.aabb_max.x()) @@ -516,20 +518,21 @@ namespace { // 2. Intersect 4 queries at once // (child_r.min <= query.max) && (query.min <= child_r.max) - const simd_int4 intersects_r = (child_r.aabb_min.x() <= q_max_x) + const xs::batch_bool intersects_r = + (child_r.aabb_min.x() <= q_max_x) & (child_r.aabb_min.y() <= q_max_y) & (child_r.aabb_min.z() <= q_max_z) & (q_min_x <= child_r.aabb_max.x()) & (q_min_y <= child_r.aabb_max.y()) & (q_min_z <= child_r.aabb_max.z()); - const bool any_intersects_l = simd_any(intersects_l); - const bool any_intersects_r = simd_any(intersects_r); + const bool any_intersects_l = xs::any(intersects_l); + const bool any_intersects_r = xs::any(intersects_r); // Query overlaps a leaf node => report collision if (any_intersects_l && child_l.is_leaf()) { for (int k = 0; k < n_queries; ++k) { - if (intersects_l[k]) { + if (intersects_l.get(k)) { attempt_add_candidate< Candidate, swap_order, triangular>( queries[k], child_l, can_collide, candidates); @@ -538,7 +541,7 @@ namespace { } if (any_intersects_r && child_r.is_leaf()) { for (int k = 0; k < n_queries; ++k) { - if (intersects_r[k]) { + if (intersects_r.get(k)) { attempt_add_candidate< Candidate, swap_order, triangular>( queries[k], child_r, can_collide, candidates); @@ -576,9 +579,12 @@ namespace { const std::function& can_collide, tbb::enumerable_thread_specific>& storage) { -#ifdef __APPLE__ // Only support SIMD on Apple platforms for now - constexpr size_t SIMD_SIZE = use_simd ? 4 : 1; - constexpr size_t GRAIN_SIZE = use_simd ? 16 : 1; +#ifdef IPC_TOOLKIT_WITH_SIMD // Only support SIMD on Apple platforms for now + constexpr size_t SIMD_SIZE = use_simd ? xs::batch::size : 1; + static_assert( + 64 % xs::batch::size == 0, "GRAIN_SIZE must be an integer"); + constexpr size_t GRAIN_SIZE = + use_simd ? (64 / xs::batch::size) : 1; #else constexpr size_t SIMD_SIZE = 1; constexpr size_t GRAIN_SIZE = 1; @@ -595,11 +601,13 @@ namespace { tbb::blocked_range(size_t(0), n_tasks, GRAIN_SIZE), [&](const tbb::blocked_range& r) { auto& local_candidates = storage.local(); +#ifdef IPC_TOOLKIT_WITH_SIMD const size_t actual_end = // Handle tail case std::min(SIMD_SIZE * r.end(), n_source_leaves); +#endif for (size_t i = r.begin(); i < r.end(); ++i) { const size_t idx = SIMD_SIZE * i; -#ifdef __APPLE__ +#ifdef IPC_TOOLKIT_WITH_SIMD if constexpr (use_simd) { assert(actual_end - idx >= 1); traverse_lbvh_simd( @@ -611,7 +619,7 @@ namespace { traverse_lbvh( source[source_leaf_offset + idx], target, can_collide, local_candidates); -#ifdef __APPLE__ +#ifdef IPC_TOOLKIT_WITH_SIMD } #endif } diff --git a/src/ipc/config.hpp.in b/src/ipc/config.hpp.in index 6a0db7187..b47f3d9b8 100644 --- a/src/ipc/config.hpp.in +++ b/src/ipc/config.hpp.in @@ -13,6 +13,7 @@ #cmakedefine IPC_TOOLKIT_WITH_INEXACT_CCD #cmakedefine IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION #cmakedefine IPC_TOOLKIT_WITH_CUDA +#cmakedefine IPC_TOOLKIT_WITH_SIMD #cmakedefine IPC_TOOLKIT_WITH_ROBIN_MAP #cmakedefine IPC_TOOLKIT_WITH_ABSEIL #cmakedefine IPC_TOOLKIT_WITH_FILIB From ef469084be2200320b8f9e1d098e3cbc9a95616b Mon Sep 17 00:00:00 2001 From: Zachary Ferguson Date: Mon, 26 Jan 2026 10:01:18 -0500 Subject: [PATCH 2/7] Generalize SIMD traversal to variable width --- src/ipc/broad_phase/lbvh.cpp | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp index 12a1e3a9a..e81bb6cc9 100644 --- a/src/ipc/broad_phase/lbvh.cpp +++ b/src/ipc/broad_phase/lbvh.cpp @@ -15,6 +15,8 @@ namespace xs = xsimd; #endif +#include + using namespace std::placeholders; namespace ipc { @@ -451,7 +453,7 @@ namespace { #ifdef IPC_TOOLKIT_WITH_SIMD // SIMD Traversal - // Traverses 4 queries simultaneously using SIMD. + // Traverses multiple queries simultaneously using SIMD. template void traverse_lbvh_simd( const LBVH::Node* queries, @@ -460,15 +462,24 @@ namespace { const std::function& can_collide, std::vector& candidates) { - assert(n_queries >= 1 && n_queries <= 4); - // Load 4 queries into single registers (Structure of Arrays) - auto make_simd = [&](auto F) -> xs::batch { - return xs::batch { - F(0), - n_queries > 1 ? F(1) : 0.0f, - n_queries > 2 ? F(2) : 0.0f, - n_queries > 3 ? F(3) : 0.0f, - }; + using batch_t = xs::batch; + assert(n_queries >= 1 && n_queries <= batch_t::size); + + // Load queries into single registers + auto make_simd = [&](auto F) -> batch_t { + // 1. Create a buffer of the correct architecture-dependent size + alignas(xs::default_arch::alignment()) + std::array + buffer; + +#pragma unroll + // 2. Fill the buffer, respecting the actual number of queries + for (size_t i = 0; i < batch_t::size; ++i) { + buffer[i] = (i < n_queries) ? F(static_cast(i)) : 0.0f; + } + + // 3. Load the buffer into the SIMD register + return batch_t::load_aligned(buffer.data()); }; const auto q_min_x = From f98b593d060d79aec6d9ac175ded919f8f986b0d Mon Sep 17 00:00:00 2001 From: Zachary Ferguson Date: Mon, 26 Jan 2026 11:51:34 -0500 Subject: [PATCH 3/7] Fix issue with abs(double) casting to int --- CMakeLists.txt | 4 ++-- src/ipc/utils/eigen_ext.hpp | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a4c23b54..3820456cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -267,8 +267,8 @@ if(IPC_TOOLKIT_WITH_SIMD) include(xsimd) target_link_libraries(ipc_toolkit PRIVATE xsimd::xsimd) - # Disable vectorization in Eigen since I've found it to cause issues. - target_compile_definitions(Eigen3_Eigen INTERFACE EIGEN_DONT_VECTORIZE) + # Disable vectorization in Eigen since I've found it to have alignment issues. + target_compile_definitions(Eigen3_Eigen INTERFACE EIGEN_DONT_VECTORIZE=1) endif() # For MSVC, do not use the min and max macros. diff --git a/src/ipc/utils/eigen_ext.hpp b/src/ipc/utils/eigen_ext.hpp index ffdda82eb..1acc2471a 100644 --- a/src/ipc/utils/eigen_ext.hpp +++ b/src/ipc/utils/eigen_ext.hpp @@ -5,6 +5,13 @@ #include +#ifdef EIGEN_DONT_VECTORIZE +// NOTE: Avoid error about abs casting double to int. Eigen does this +// internally but seemingly only if EIGEN_DONT_VECTORIZE is not defined. +// TODO: We should always using std::abs to avoid this issue. +using std::abs; +#endif + namespace Eigen { template using RowRef = Ref>; template using ConstRef = const Ref&; From cd68104f8df0a937a670b5e509723baf685cb493 Mon Sep 17 00:00:00 2001 From: Zachary Ferguson Date: Mon, 26 Jan 2026 12:30:11 -0500 Subject: [PATCH 4/7] Remove SIMD preset from CMake configuration and update lbvh.cpp buffer initialization --- CMakePresets.json | 13 +------------ IPCToolkitOptions.cmake.sample | 2 +- src/ipc/broad_phase/lbvh.cpp | 2 +- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index fee266796..9c283ee56 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -51,16 +51,6 @@ "IPC_TOOLKIT_WITH_CUDA": "ON" } }, - { - "name": "simd", - "inherits": "release", - "displayName": "SIMD Enabled", - "description": "Build with SIMD optimizations", - "binaryDir": "${sourceDir}/build/simd", - "cacheVariables": { - "IPC_TOOLKIT_WITH_SIMD": "ON" - } - }, { "name": "test", "inherits": "debug", @@ -82,7 +72,6 @@ "cacheVariables": { "IPC_TOOLKIT_BUILD_PYTHON": "ON", "IPC_TOOLKIT_BUILD_TESTS": "OFF", - "IPC_TOOLKIT_WITH_SIMD": "OFF", "IPC_TOOLKIT_WITH_CUDA": "OFF" } }, @@ -166,4 +155,4 @@ } } ] -} +} \ No newline at end of file diff --git a/IPCToolkitOptions.cmake.sample b/IPCToolkitOptions.cmake.sample index 8b0621631..6ebb4f7a3 100644 --- a/IPCToolkitOptions.cmake.sample +++ b/IPCToolkitOptions.cmake.sample @@ -31,12 +31,12 @@ # option(IPC_TOOLKIT_BUILD_TESTS "Build unit-tests" ON) # option(IPC_TOOLKIT_BUILD_PYTHON "Build Python bindings" OFF) # option(IPC_TOOLKIT_WITH_CUDA "Enable CUDA CCD" OFF) +# option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" ON) # option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF) # option(IPC_TOOLKIT_WITH_ROBIN_MAP "Use Tessil's robin-map rather than std maps" ON) # option(IPC_TOOLKIT_WITH_ABSEIL "Use Abseil's hash functions" ON) # option(IPC_TOOLKIT_WITH_FILIB "Use filib for interval arithmetic" ON) # option(IPC_TOOLKIT_WITH_INEXACT_CCD "Use the original inexact CCD method of IPC" OFF) -# option(IPC_TOOLKIT_WITH_SIMD "Enable SIMD" OFF) # option(IPC_TOOLKIT_WITH_CODE_COVERAGE "Enable coverage reporting" OFF) # option(IPC_TOOLKIT_TESTS_CCD_BENCHMARK "Enable CCD benchmark test" ON) # set(IPC_TOOLKIT_TESTS_CCD_BENCHMARK_DIR "" CACHE PATH "Path to the CCD benchmark directory") diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp index e81bb6cc9..8d06128b6 100644 --- a/src/ipc/broad_phase/lbvh.cpp +++ b/src/ipc/broad_phase/lbvh.cpp @@ -470,7 +470,7 @@ namespace { // 1. Create a buffer of the correct architecture-dependent size alignas(xs::default_arch::alignment()) std::array - buffer; + buffer {}; #pragma unroll // 2. Fill the buffer, respecting the actual number of queries From f79d470577e60acad93d4cd9126b5f6905b39262 Mon Sep 17 00:00:00 2001 From: Zachary Ferguson Date: Mon, 26 Jan 2026 12:38:35 -0500 Subject: [PATCH 5/7] Use EIGEN_USING_STD for std::abs to avoid casting issues --- src/ipc/utils/eigen_ext.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ipc/utils/eigen_ext.hpp b/src/ipc/utils/eigen_ext.hpp index 1acc2471a..946398f63 100644 --- a/src/ipc/utils/eigen_ext.hpp +++ b/src/ipc/utils/eigen_ext.hpp @@ -9,7 +9,7 @@ // NOTE: Avoid error about abs casting double to int. Eigen does this // internally but seemingly only if EIGEN_DONT_VECTORIZE is not defined. // TODO: We should always using std::abs to avoid this issue. -using std::abs; +EIGEN_USING_STD(abs); // using std::abs; #endif namespace Eigen { From e5c3bf15c85ae0a1dee731b497a0d0a4cec45634 Mon Sep 17 00:00:00 2001 From: Zachary Ferguson Date: Mon, 26 Jan 2026 13:40:46 -0500 Subject: [PATCH 6/7] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/ipc/broad_phase/lbvh.cpp | 5 +++-- src/ipc/utils/eigen_ext.hpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp index 8d06128b6..439856f6a 100644 --- a/src/ipc/broad_phase/lbvh.cpp +++ b/src/ipc/broad_phase/lbvh.cpp @@ -10,7 +10,8 @@ #include #ifdef IPC_TOOLKIT_WITH_SIMD -// We utilize SIMD registers to compare 1 Node against 4 Queries simultaneously. +// We utilize SIMD registers to compare one node against multiple queries simultaneously, +// with the number of queries determined by xs::batch::size. #include namespace xs = xsimd; #endif @@ -590,7 +591,7 @@ namespace { const std::function& can_collide, tbb::enumerable_thread_specific>& storage) { -#ifdef IPC_TOOLKIT_WITH_SIMD // Only support SIMD on Apple platforms for now +#ifdef IPC_TOOLKIT_WITH_SIMD // Enable SIMD acceleration when available constexpr size_t SIMD_SIZE = use_simd ? xs::batch::size : 1; static_assert( 64 % xs::batch::size == 0, "GRAIN_SIZE must be an integer"); diff --git a/src/ipc/utils/eigen_ext.hpp b/src/ipc/utils/eigen_ext.hpp index 946398f63..a364bac0b 100644 --- a/src/ipc/utils/eigen_ext.hpp +++ b/src/ipc/utils/eigen_ext.hpp @@ -8,7 +8,7 @@ #ifdef EIGEN_DONT_VECTORIZE // NOTE: Avoid error about abs casting double to int. Eigen does this // internally but seemingly only if EIGEN_DONT_VECTORIZE is not defined. -// TODO: We should always using std::abs to avoid this issue. +// TODO: We should always use std::abs to avoid this issue. EIGEN_USING_STD(abs); // using std::abs; #endif From 52eb4cf25254fa18c92424cdbbb33af021c60662 Mon Sep 17 00:00:00 2001 From: Zachary Ferguson Date: Mon, 26 Jan 2026 13:45:38 -0500 Subject: [PATCH 7/7] Refactor comments for clarity in SIMD query intersection logic --- src/ipc/broad_phase/lbvh.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp index 439856f6a..7f3f5d311 100644 --- a/src/ipc/broad_phase/lbvh.cpp +++ b/src/ipc/broad_phase/lbvh.cpp @@ -10,8 +10,9 @@ #include #ifdef IPC_TOOLKIT_WITH_SIMD -// We utilize SIMD registers to compare one node against multiple queries simultaneously, -// with the number of queries determined by xs::batch::size. +// We utilize SIMD registers to compare one node against multiple queries +// simultaneously, with the number of queries determined by +// xs::batch::size. #include namespace xs = xsimd; #endif @@ -518,7 +519,7 @@ namespace { const LBVH::Node& child_l = lbvh[node.left]; const LBVH::Node& child_r = lbvh[node.right]; - // 1. Intersect 4 queries at once + // 1. Intersect multiple queries at once // (child_l.min <= query.max) && (query.min <= child_l.max) const xs::batch_bool intersects_l = (child_l.aabb_min.x() <= q_max_x) @@ -528,7 +529,7 @@ namespace { & (q_min_y <= child_l.aabb_max.y()) & (q_min_z <= child_l.aabb_max.z()); - // 2. Intersect 4 queries at once + // 2. Intersect multiple queries at once // (child_r.min <= query.max) && (query.min <= child_r.max) const xs::batch_bool intersects_r = (child_r.aabb_min.x() <= q_max_x)