From 3a2e03e3d6a68e9ffdadda206d88eeba93d17d9e Mon Sep 17 00:00:00 2001
From: Zachary Ferguson <zy.fergus@gmail.com>
Date: Mon, 26 Jan 2026 01:28:46 -0500
Subject: [PATCH 1/7] Add optional SIMD support via xsimd

- Add cross-platform SIMD in LBVH using the xsimd library
---
 CMakeLists.txt               | 34 ++++++++++++++++-------
 cmake/recipes/xsimd.cmake    | 15 +++++++++++
 src/ipc/broad_phase/lbvh.cpp | 52 +++++++++++++++++++++---------------
 src/ipc/config.hpp.in        |  1 +
 4 files changed, 70 insertions(+), 32 deletions(-)
 create mode 100644 cmake/recipes/xsimd.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 19208d9e4..3a4c23b54 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,6 +75,7 @@ else()
 endif()
 
 option(IPC_TOOLKIT_WITH_CUDA                  "Enable CUDA CCD"                               OFF)
+option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                    ON)
 option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF)
 option(IPC_TOOLKIT_WITH_ROBIN_MAP             "Use Tessil's robin-map rather than std maps"    ON)
 option(IPC_TOOLKIT_WITH_ABSEIL                "Use Abseil's hash functions"                    ON)
@@ -83,10 +84,8 @@ option(IPC_TOOLKIT_WITH_INEXACT_CCD           "Use the original inexact CCD meth
 option(IPC_TOOLKIT_WITH_PROFILER              "Enable performance profiler"                   OFF)
 
 # Advanced options
-option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                   OFF)
 option(IPC_TOOLKIT_WITH_CODE_COVERAGE         "Enable coverage reporting"                     OFF)
 
-mark_as_advanced(IPC_TOOLKIT_WITH_SIMD)          # This does not work reliably
 mark_as_advanced(IPC_TOOLKIT_WITH_CODE_COVERAGE) # This is used in GitHub Actions
 
 # Set default minimum C++ standard
@@ -112,9 +111,10 @@ include(ipc_toolkit_use_colors)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 ################################################################################
-# CUDA
+# Verify Options
 ################################################################################
 
+# CUDA support
 if(IPC_TOOLKIT_WITH_CUDA)
   # If CMAKE_CUDA_ARCHITECTURES was not specified, set it to native.
   if(DEFINED CMAKE_CUDA_ARCHITECTURES)
@@ -129,6 +129,19 @@ if(IPC_TOOLKIT_WITH_CUDA)
   enable_language(CUDA)
 endif()
 
+## SIMD support
+if(IPC_TOOLKIT_WITH_SIMD)
+  # Ask find_package(SIMD) for compiler flags; an empty SIMD_CXX_FLAGS is treated as "no SIMD".
+  message(STATUS "Testing SIMD capabilities...")
+  find_package(SIMD)
+  if(SIMD_CXX_FLAGS)
+    message(STATUS "SIMD support found: ${SIMD_CXX_FLAGS}")
+  else()
+    message(WARNING "SIMD support requested but not found. Continuing without SIMD.")
+    set(IPC_TOOLKIT_WITH_SIMD OFF) # Shadow the cached option for this run only; never FORCE over the user's choice.
+  endif()
+endif()
+
 ################################################################################
 # IPC Toolkit Library
 ################################################################################
@@ -247,14 +260,15 @@ target_link_libraries(ipc_toolkit PRIVATE ipc::toolkit::warnings)
 
 ## SIMD support
 if(IPC_TOOLKIT_WITH_SIMD)
-  # Figure out SIMD support
-  message(STATUS "Testing SIMD capabilities...")
-  find_package(SIMD)
   # Add SIMD flags to compiler flags
-  message(STATUS "Using SIMD flags: ${SIMD_FLAGS}")
-  target_compile_options(ipc_toolkit PRIVATE ${SIMD_FLAGS})
-else()
-  message(STATUS "SIMD support disabled")
+  target_compile_options(ipc_toolkit PRIVATE ${SIMD_CXX_FLAGS})
+
+  # Link against cross-platform xsimd library
+  include(xsimd)
+  target_link_libraries(ipc_toolkit PRIVATE xsimd::xsimd)
+
+  # Disable vectorization in Eigen since I've found it to cause issues.
+  target_compile_definitions(Eigen3_Eigen INTERFACE EIGEN_DONT_VECTORIZE)
 endif()
 
 # For MSVC, do not use the min and max macros.
diff --git a/cmake/recipes/xsimd.cmake b/cmake/recipes/xsimd.cmake
new file mode 100644
index 000000000..cf23784b8
--- /dev/null
+++ b/cmake/recipes/xsimd.cmake
@@ -0,0 +1,15 @@
+# xsimd (https://github.com/xtensor-stack/xsimd)
+# License: BSD-3-Clause
+if(TARGET xsimd::xsimd)
+  return() # The namespaced target already exists, so there is nothing left to do.
+endif()
+
+message(STATUS "Third-party: creating target 'xsimd::xsimd'")
+
+include(CPM)
+CPMAddPackage(NAME xsimd GITHUB_REPOSITORY xtensor-stack/xsimd GIT_TAG 14.0.0)
+
+add_library(xsimd::xsimd ALIAS xsimd)
+
+# Group the dependency under "ThirdParty" in IDE project views
+set_property(TARGET xsimd PROPERTY FOLDER "ThirdParty")
diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp
index 0383ac9c0..12a1e3a9a 100644
--- a/src/ipc/broad_phase/lbvh.cpp
+++ b/src/ipc/broad_phase/lbvh.cpp
@@ -9,9 +9,10 @@
 #include <tbb/parallel_for.h>
 #include <tbb/parallel_sort.h>
 
-#ifdef __APPLE__
+#ifdef IPC_TOOLKIT_WITH_SIMD
 // We utilize SIMD registers to compare 1 Node against 4 Queries simultaneously.
-#include <simd/simd.h>
+#include <xsimd/xsimd.hpp>
+namespace xs = xsimd;
 #endif
 
 using namespace std::placeholders;
@@ -448,7 +449,7 @@ namespace {
         } while (node_idx != LBVH::Node::INVALID_POINTER); // Same as root
     }
 
-#ifdef __APPLE__
+#ifdef IPC_TOOLKIT_WITH_SIMD
     // SIMD Traversal
     // Traverses 4 queries simultaneously using SIMD.
     template <typename Candidate, bool swap_order, bool triangular>
@@ -461,8 +462,8 @@ namespace {
     {
         assert(n_queries >= 1 && n_queries <= 4);
         // Load 4 queries into single registers (Structure of Arrays)
-        auto make_simd = [&](auto F) -> simd_float4 {
-            return simd_float4 {
+        auto make_simd = [&](auto F) -> xs::batch<float> {
+            return xs::batch<float> {
                 F(0),
                 n_queries > 1 ? F(1) : 0.0f,
                 n_queries > 2 ? F(2) : 0.0f,
@@ -470,17 +471,17 @@ namespace {
             };
         };
 
-        const simd_float4 q_min_x =
+        const auto q_min_x =
             make_simd([&](int k) { return queries[k].aabb_min.x(); });
-        const simd_float4 q_min_y =
+        const auto q_min_y =
             make_simd([&](int k) { return queries[k].aabb_min.y(); });
-        const simd_float4 q_min_z =
+        const auto q_min_z =
             make_simd([&](int k) { return queries[k].aabb_min.z(); });
-        const simd_float4 q_max_x =
+        const auto q_max_x =
             make_simd([&](int k) { return queries[k].aabb_max.x(); });
-        const simd_float4 q_max_y =
+        const auto q_max_y =
             make_simd([&](int k) { return queries[k].aabb_max.y(); });
-        const simd_float4 q_max_z =
+        const auto q_max_z =
             make_simd([&](int k) { return queries[k].aabb_max.z(); });
 
         // Use a fixed-size array as a stack to avoid dynamic allocations
@@ -507,7 +508,8 @@ namespace {
 
             // 1. Intersect 4 queries at once
             // (child_l.min <= query.max) && (query.min <= child_l.max)
-            const simd_int4 intersects_l = (child_l.aabb_min.x() <= q_max_x)
+            const xs::batch_bool<float> intersects_l =
+                (child_l.aabb_min.x() <= q_max_x)
                 & (child_l.aabb_min.y() <= q_max_y)
                 & (child_l.aabb_min.z() <= q_max_z)
                 & (q_min_x <= child_l.aabb_max.x())
@@ -516,20 +518,21 @@ namespace {
 
             // 2. Intersect 4 queries at once
             // (child_r.min <= query.max) && (query.min <= child_r.max)
-            const simd_int4 intersects_r = (child_r.aabb_min.x() <= q_max_x)
+            const xs::batch_bool<float> intersects_r =
+                (child_r.aabb_min.x() <= q_max_x)
                 & (child_r.aabb_min.y() <= q_max_y)
                 & (child_r.aabb_min.z() <= q_max_z)
                 & (q_min_x <= child_r.aabb_max.x())
                 & (q_min_y <= child_r.aabb_max.y())
                 & (q_min_z <= child_r.aabb_max.z());
 
-            const bool any_intersects_l = simd_any(intersects_l);
-            const bool any_intersects_r = simd_any(intersects_r);
+            const bool any_intersects_l = xs::any(intersects_l);
+            const bool any_intersects_r = xs::any(intersects_r);
 
             // Query overlaps a leaf node => report collision
             if (any_intersects_l && child_l.is_leaf()) {
                 for (int k = 0; k < n_queries; ++k) {
-                    if (intersects_l[k]) {
+                    if (intersects_l.get(k)) {
                         attempt_add_candidate<
                             Candidate, swap_order, triangular>(
                             queries[k], child_l, can_collide, candidates);
@@ -538,7 +541,7 @@ namespace {
             }
             if (any_intersects_r && child_r.is_leaf()) {
                 for (int k = 0; k < n_queries; ++k) {
-                    if (intersects_r[k]) {
+                    if (intersects_r.get(k)) {
                         attempt_add_candidate<
                             Candidate, swap_order, triangular>(
                             queries[k], child_r, can_collide, candidates);
@@ -576,9 +579,12 @@ namespace {
         const std::function<bool(size_t, size_t)>& can_collide,
         tbb::enumerable_thread_specific<std::vector<Candidate>>& storage)
     {
-#ifdef __APPLE__ // Only support SIMD on Apple platforms for now
-        constexpr size_t SIMD_SIZE = use_simd ? 4 : 1;
-        constexpr size_t GRAIN_SIZE = use_simd ? 16 : 1;
+#ifdef IPC_TOOLKIT_WITH_SIMD // Only support SIMD on Apple platforms for now
+        constexpr size_t SIMD_SIZE = use_simd ? xs::batch<float>::size : 1;
+        static_assert(
+            64 % xs::batch<float>::size == 0, "GRAIN_SIZE must be an integer");
+        constexpr size_t GRAIN_SIZE =
+            use_simd ? (64 / xs::batch<float>::size) : 1;
 #else
         constexpr size_t SIMD_SIZE = 1;
         constexpr size_t GRAIN_SIZE = 1;
@@ -595,11 +601,13 @@ namespace {
             tbb::blocked_range<size_t>(size_t(0), n_tasks, GRAIN_SIZE),
             [&](const tbb::blocked_range<size_t>& r) {
                 auto& local_candidates = storage.local();
+#ifdef IPC_TOOLKIT_WITH_SIMD
                 const size_t actual_end = // Handle tail case
                     std::min(SIMD_SIZE * r.end(), n_source_leaves);
+#endif
                 for (size_t i = r.begin(); i < r.end(); ++i) {
                     const size_t idx = SIMD_SIZE * i;
-#ifdef __APPLE__
+#ifdef IPC_TOOLKIT_WITH_SIMD
                     if constexpr (use_simd) {
                         assert(actual_end - idx >= 1);
                         traverse_lbvh_simd<Candidate, swap_order, triangular>(
@@ -611,7 +619,7 @@ namespace {
                         traverse_lbvh<Candidate, swap_order, triangular>(
                             source[source_leaf_offset + idx], target,
                             can_collide, local_candidates);
-#ifdef __APPLE__
+#ifdef IPC_TOOLKIT_WITH_SIMD
                     }
 #endif
                 }
diff --git a/src/ipc/config.hpp.in b/src/ipc/config.hpp.in
index 6a0db7187..b47f3d9b8 100644
--- a/src/ipc/config.hpp.in
+++ b/src/ipc/config.hpp.in
@@ -13,6 +13,7 @@
 #cmakedefine IPC_TOOLKIT_WITH_INEXACT_CCD
 #cmakedefine IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION
 #cmakedefine IPC_TOOLKIT_WITH_CUDA
+#cmakedefine IPC_TOOLKIT_WITH_SIMD
 #cmakedefine IPC_TOOLKIT_WITH_ROBIN_MAP
 #cmakedefine IPC_TOOLKIT_WITH_ABSEIL
 #cmakedefine IPC_TOOLKIT_WITH_FILIB

From ef469084be2200320b8f9e1d098e3cbc9a95616b Mon Sep 17 00:00:00 2001
From: Zachary Ferguson <zy.fergus@gmail.com>
Date: Mon, 26 Jan 2026 10:01:18 -0500
Subject: [PATCH 2/7] Generalize SIMD traversal to variable width

---
 src/ipc/broad_phase/lbvh.cpp | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp
index 12a1e3a9a..e81bb6cc9 100644
--- a/src/ipc/broad_phase/lbvh.cpp
+++ b/src/ipc/broad_phase/lbvh.cpp
@@ -15,6 +15,8 @@
 namespace xs = xsimd;
 #endif
 
+#include <array>
+
 using namespace std::placeholders;
 
 namespace ipc {
@@ -451,7 +453,7 @@ namespace {
 
 #ifdef IPC_TOOLKIT_WITH_SIMD
     // SIMD Traversal
-    // Traverses 4 queries simultaneously using SIMD.
+    // Traverses multiple queries simultaneously using SIMD.
     template <typename Candidate, bool swap_order, bool triangular>
     void traverse_lbvh_simd(
         const LBVH::Node* queries,
@@ -460,15 +462,24 @@ namespace {
         const std::function<bool(size_t, size_t)>& can_collide,
         std::vector<Candidate>& candidates)
     {
-        assert(n_queries >= 1 && n_queries <= 4);
-        // Load 4 queries into single registers (Structure of Arrays)
-        auto make_simd = [&](auto F) -> xs::batch<float> {
-            return xs::batch<float> {
-                F(0),
-                n_queries > 1 ? F(1) : 0.0f,
-                n_queries > 2 ? F(2) : 0.0f,
-                n_queries > 3 ? F(3) : 0.0f,
-            };
+        using batch_t = xs::batch<float>;
+        assert(n_queries >= 1 && n_queries <= batch_t::size);
+
+        // Load queries into single registers
+        auto make_simd = [&](auto F) -> batch_t {
+            // 1. Create a buffer of the correct architecture-dependent size
+            alignas(xs::default_arch::alignment())
+                std::array<float, batch_t::size>
+                    buffer;
+
+#pragma unroll
+            // 2. Fill the buffer, respecting the actual number of queries
+            for (size_t i = 0; i < batch_t::size; ++i) {
+                buffer[i] = (i < n_queries) ? F(static_cast<int>(i)) : 0.0f;
+            }
+
+            // 3. Load the buffer into the SIMD register
+            return batch_t::load_aligned(buffer.data());
         };
 
         const auto q_min_x =

From f98b593d060d79aec6d9ac175ded919f8f986b0d Mon Sep 17 00:00:00 2001
From: Zachary Ferguson <zach.ferguson@clo3d.com>
Date: Mon, 26 Jan 2026 11:51:34 -0500
Subject: [PATCH 3/7] Fix issue with abs(double) casting to int

---
 CMakeLists.txt              | 4 ++--
 src/ipc/utils/eigen_ext.hpp | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3a4c23b54..3820456cb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -267,8 +267,8 @@ if(IPC_TOOLKIT_WITH_SIMD)
   include(xsimd)
   target_link_libraries(ipc_toolkit PRIVATE xsimd::xsimd)
 
-  # Disable vectorization in Eigen since I've found it to cause issues.
-  target_compile_definitions(Eigen3_Eigen INTERFACE EIGEN_DONT_VECTORIZE)
+  # Disable vectorization in Eigen since I've found it to have alignment issues.
+  target_compile_definitions(Eigen3_Eigen INTERFACE EIGEN_DONT_VECTORIZE=1)
 endif()
 
 # For MSVC, do not use the min and max macros.
diff --git a/src/ipc/utils/eigen_ext.hpp b/src/ipc/utils/eigen_ext.hpp
index ffdda82eb..1acc2471a 100644
--- a/src/ipc/utils/eigen_ext.hpp
+++ b/src/ipc/utils/eigen_ext.hpp
@@ -5,6 +5,13 @@
 
 #include <cassert>
 
+#ifdef EIGEN_DONT_VECTORIZE
+// NOTE: Avoid error about abs casting double to int. Eigen does this
+// internally but seemingly only if EIGEN_DONT_VECTORIZE is not defined.
+// TODO: We should always using std::abs to avoid this issue.
+using std::abs;
+#endif
+
 namespace Eigen {
 template <typename T> using RowRef = Ref<T, 0, Eigen::InnerStride<>>;
 template <typename T> using ConstRef = const Ref<const T>&;

From cd68104f8df0a937a670b5e509723baf685cb493 Mon Sep 17 00:00:00 2001
From: Zachary Ferguson <zy.fergus@gmail.com>
Date: Mon, 26 Jan 2026 12:30:11 -0500
Subject: [PATCH 4/7] Remove SIMD preset from CMake configuration and update
 lbvh.cpp buffer initialization

---
 CMakePresets.json              | 13 +------------
 IPCToolkitOptions.cmake.sample |  2 +-
 src/ipc/broad_phase/lbvh.cpp   |  2 +-
 3 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index fee266796..9c283ee56 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -51,16 +51,6 @@
         "IPC_TOOLKIT_WITH_CUDA": "ON"
       }
     },
-    {
-      "name": "simd",
-      "inherits": "release",
-      "displayName": "SIMD Enabled",
-      "description": "Build with SIMD optimizations",
-      "binaryDir": "${sourceDir}/build/simd",
-      "cacheVariables": {
-        "IPC_TOOLKIT_WITH_SIMD": "ON"
-      }
-    },
     {
       "name": "test",
       "inherits": "debug",
@@ -82,7 +72,6 @@
       "cacheVariables": {
         "IPC_TOOLKIT_BUILD_PYTHON": "ON",
         "IPC_TOOLKIT_BUILD_TESTS": "OFF",
-        "IPC_TOOLKIT_WITH_SIMD": "OFF",
         "IPC_TOOLKIT_WITH_CUDA": "OFF"
       }
     },
@@ -166,4 +155,4 @@
       }
     }
   ]
-}
+}
\ No newline at end of file
diff --git a/IPCToolkitOptions.cmake.sample b/IPCToolkitOptions.cmake.sample
index 8b0621631..6ebb4f7a3 100644
--- a/IPCToolkitOptions.cmake.sample
+++ b/IPCToolkitOptions.cmake.sample
@@ -31,12 +31,12 @@
 # option(IPC_TOOLKIT_BUILD_TESTS                "Build unit-tests"                               ON)
 # option(IPC_TOOLKIT_BUILD_PYTHON               "Build Python bindings"                         OFF)
 # option(IPC_TOOLKIT_WITH_CUDA                  "Enable CUDA CCD"                               OFF)
+# option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                    ON)
 # option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF)
 # option(IPC_TOOLKIT_WITH_ROBIN_MAP             "Use Tessil's robin-map rather than std maps"    ON)
 # option(IPC_TOOLKIT_WITH_ABSEIL                "Use Abseil's hash functions"                    ON)
 # option(IPC_TOOLKIT_WITH_FILIB                 "Use filib for interval arithmetic"              ON)
 # option(IPC_TOOLKIT_WITH_INEXACT_CCD           "Use the original inexact CCD method of IPC"    OFF)
-# option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                   OFF)
 # option(IPC_TOOLKIT_WITH_CODE_COVERAGE         "Enable coverage reporting"                     OFF)
 # option(IPC_TOOLKIT_TESTS_CCD_BENCHMARK        "Enable CCD benchmark test"                      ON)
 # set(IPC_TOOLKIT_TESTS_CCD_BENCHMARK_DIR     "" CACHE PATH "Path to the CCD benchmark directory")
diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp
index e81bb6cc9..8d06128b6 100644
--- a/src/ipc/broad_phase/lbvh.cpp
+++ b/src/ipc/broad_phase/lbvh.cpp
@@ -470,7 +470,7 @@ namespace {
             // 1. Create a buffer of the correct architecture-dependent size
             alignas(xs::default_arch::alignment())
                 std::array<float, batch_t::size>
-                    buffer;
+                    buffer {};
 
 #pragma unroll
             // 2. Fill the buffer, respecting the actual number of queries

From f79d470577e60acad93d4cd9126b5f6905b39262 Mon Sep 17 00:00:00 2001
From: Zachary Ferguson <zy.fergus@gmail.com>
Date: Mon, 26 Jan 2026 12:38:35 -0500
Subject: [PATCH 5/7] Use EIGEN_USING_STD for std::abs to avoid casting issues

---
 src/ipc/utils/eigen_ext.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ipc/utils/eigen_ext.hpp b/src/ipc/utils/eigen_ext.hpp
index 1acc2471a..946398f63 100644
--- a/src/ipc/utils/eigen_ext.hpp
+++ b/src/ipc/utils/eigen_ext.hpp
@@ -9,7 +9,7 @@
 // NOTE: Avoid error about abs casting double to int. Eigen does this
 // internally but seemingly only if EIGEN_DONT_VECTORIZE is not defined.
 // TODO: We should always using std::abs to avoid this issue.
-using std::abs;
+EIGEN_USING_STD(abs); // using std::abs;
 #endif
 
 namespace Eigen {

From e5c3bf15c85ae0a1dee731b497a0d0a4cec45634 Mon Sep 17 00:00:00 2001
From: Zachary Ferguson <zy.fergus@gmail.com>
Date: Mon, 26 Jan 2026 13:40:46 -0500
Subject: [PATCH 6/7] Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/ipc/broad_phase/lbvh.cpp | 5 +++--
 src/ipc/utils/eigen_ext.hpp  | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp
index 8d06128b6..439856f6a 100644
--- a/src/ipc/broad_phase/lbvh.cpp
+++ b/src/ipc/broad_phase/lbvh.cpp
@@ -10,7 +10,8 @@
 #include <tbb/parallel_sort.h>
 
 #ifdef IPC_TOOLKIT_WITH_SIMD
-// We utilize SIMD registers to compare 1 Node against 4 Queries simultaneously.
+// We utilize SIMD registers to compare one node against multiple queries simultaneously,
+// with the number of queries determined by xs::batch<float>::size.
 #include <xsimd/xsimd.hpp>
 namespace xs = xsimd;
 #endif
@@ -590,7 +591,7 @@ namespace {
         const std::function<bool(size_t, size_t)>& can_collide,
         tbb::enumerable_thread_specific<std::vector<Candidate>>& storage)
     {
-#ifdef IPC_TOOLKIT_WITH_SIMD // Only support SIMD on Apple platforms for now
+#ifdef IPC_TOOLKIT_WITH_SIMD // Enable SIMD acceleration when available
         constexpr size_t SIMD_SIZE = use_simd ? xs::batch<float>::size : 1;
         static_assert(
             64 % xs::batch<float>::size == 0, "GRAIN_SIZE must be an integer");
diff --git a/src/ipc/utils/eigen_ext.hpp b/src/ipc/utils/eigen_ext.hpp
index 946398f63..a364bac0b 100644
--- a/src/ipc/utils/eigen_ext.hpp
+++ b/src/ipc/utils/eigen_ext.hpp
@@ -8,7 +8,7 @@
 #ifdef EIGEN_DONT_VECTORIZE
 // NOTE: Avoid error about abs casting double to int. Eigen does this
 // internally but seemingly only if EIGEN_DONT_VECTORIZE is not defined.
-// TODO: We should always using std::abs to avoid this issue.
+// TODO: We should always use std::abs to avoid this issue.
 EIGEN_USING_STD(abs); // using std::abs;
 #endif
 

From 52eb4cf25254fa18c92424cdbbb33af021c60662 Mon Sep 17 00:00:00 2001
From: Zachary Ferguson <zy.fergus@gmail.com>
Date: Mon, 26 Jan 2026 13:45:38 -0500
Subject: [PATCH 7/7] Refactor comments for clarity in SIMD query intersection
 logic

---
 src/ipc/broad_phase/lbvh.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/ipc/broad_phase/lbvh.cpp b/src/ipc/broad_phase/lbvh.cpp
index 439856f6a..7f3f5d311 100644
--- a/src/ipc/broad_phase/lbvh.cpp
+++ b/src/ipc/broad_phase/lbvh.cpp
@@ -10,8 +10,9 @@
 #include <tbb/parallel_sort.h>
 
 #ifdef IPC_TOOLKIT_WITH_SIMD
-// We utilize SIMD registers to compare one node against multiple queries simultaneously,
-// with the number of queries determined by xs::batch<float>::size.
+// We utilize SIMD registers to compare one node against multiple queries
+// simultaneously, with the number of queries determined by
+// xs::batch<float>::size.
 #include <xsimd/xsimd.hpp>
 namespace xs = xsimd;
 #endif
@@ -518,7 +519,7 @@ namespace {
             const LBVH::Node& child_l = lbvh[node.left];
             const LBVH::Node& child_r = lbvh[node.right];
 
-            // 1. Intersect 4 queries at once
+            // 1. Intersect all batched queries with the left child at once
             // (child_l.min <= query.max) && (query.min <= child_l.max)
             const xs::batch_bool<float> intersects_l =
                 (child_l.aabb_min.x() <= q_max_x)
@@ -528,7 +529,7 @@ namespace {
                 & (q_min_y <= child_l.aabb_max.y())
                 & (q_min_z <= child_l.aabb_max.z());
 
-            // 2. Intersect 4 queries at once
+            // 2. Intersect all batched queries with the right child at once
             // (child_r.min <= query.max) && (query.min <= child_r.max)
             const xs::batch_bool<float> intersects_r =
                 (child_r.aabb_min.x() <= q_max_x)