From b1d1eb2670d03dc3570e21089f4e6f52f04e9241 Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Thu, 3 Jul 2025 16:51:22 +0200
Subject: [PATCH] Use xsimd::first in reduction implementation

Provide specialization for all remaining architectures.
---
 .../xsimd/arch/common/xsimd_common_math.hpp |   2 +-
 include/xsimd/arch/xsimd_emulated.hpp       |   7 ++
 include/xsimd/arch/xsimd_rvv.hpp            |  13 +++
 include/xsimd/arch/xsimd_sse2.hpp           | 102 +++++++++---------
 include/xsimd/arch/xsimd_sve.hpp            |   7 ++
 include/xsimd/arch/xsimd_wasm.hpp           |  36 +++++++
 6 files changed, 115 insertions(+), 52 deletions(-)

diff --git a/include/xsimd/arch/common/xsimd_common_math.hpp b/include/xsimd/arch/common/xsimd_common_math.hpp
index 9bc460922..689029aae 100644
--- a/include/xsimd/arch/common/xsimd_common_math.hpp
+++ b/include/xsimd/arch/common/xsimd_common_math.hpp
@@ -2117,7 +2117,7 @@ namespace xsimd
             template <class Op, class A, class T>
             XSIMD_INLINE T reduce(Op, batch<T, A> const& self, std::integral_constant<unsigned, 1>) noexcept
             {
-                return self.get(0);
+                return ::xsimd::kernel::first(self, A {});
             }
 
             template <class Op, class A, class T, unsigned Lvl>
diff --git a/include/xsimd/arch/xsimd_emulated.hpp b/include/xsimd/arch/xsimd_emulated.hpp
index 11e607479..bf169cd9a 100644
--- a/include/xsimd/arch/xsimd_emulated.hpp
+++ b/include/xsimd/arch/xsimd_emulated.hpp
@@ -230,6 +230,13 @@ namespace xsimd
             return r;
         }
 
+        // first
+        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+        T XSIMD_INLINE first(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+        {
+            return self.data[0];
+        }
+
 #if 0
         // count
         template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp
index ef00e7567..610c0944f 100644
--- a/include/xsimd/arch/xsimd_rvv.hpp
+++ b/include/xsimd/arch/xsimd_rvv.hpp
@@ -1335,6 +1335,19 @@ namespace xsimd
            return result;
        }
 
+        // first
+        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        XSIMD_INLINE T first(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+        {
+            return detail::rvvmv_lane0(arg);
+        }
+
+        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        XSIMD_INLINE std::complex<T> first(batch<std::complex<T>, A> const& arg, requires_arch<rvv>) noexcept
+        {
+            return std::complex<T> { detail::rvvmv_lane0(arg.real()), detail::rvvmv_lane0(arg.imag()) };
+        }
+
         // insert
         template <class A, class T, size_t I, detail::rvv_enable_all_t<T> = 0>
         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<rvv>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index 59a39363b..17bdd2bb2 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -673,6 +673,53 @@ namespace xsimd
             return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other)));
         }
 
+        // first
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtss_f32(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(self);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFFFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+#if defined(__x86_64__)
+                return static_cast<T>(_mm_cvtsi128_si64(self));
+#else
+                __m128i m;
+                _mm_storel_epi64(&m, self);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
         // from_mask
         template <class A>
         XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
@@ -1269,10 +1316,10 @@ namespace xsimd
             batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
             batch<T, A> acc2 = max(acc1, step2);
             if (sizeof(T) == 2)
-                return acc2.get(0);
+                return first(acc2, A {});
             batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
             batch<T, A> acc3 = max(acc2, step3);
-            return acc3.get(0);
+            return first(acc3, A {});
         }
 
         // reduce_min
@@ -1291,10 +1338,10 @@ namespace xsimd
             batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
             batch<T, A> acc2 = min(acc1, step2);
             if (sizeof(T) == 2)
-                return acc2.get(0);
+                return first(acc2, A {});
             batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
             batch<T, A> acc3 = min(acc2, step3);
-            return acc3.get(0);
+            return first(acc3, A {});
         }
 
         // rsqrt
@@ -1783,53 +1830,6 @@ namespace xsimd
             return _mm_unpacklo_pd(self, other);
         }
 
-        // first
-        template <class A>
-        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<sse2>) noexcept
-        {
-            return _mm_cvtss_f32(self);
-        }
-
-        template <class A>
-        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<sse2>) noexcept
-        {
-            return _mm_cvtsd_f64(self);
-        }
-
-        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sse2>) noexcept
-        {
-            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFFFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(self));
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-            {
-#if defined(__x86_64__)
-                return static_cast<T>(_mm_cvtsi128_si64(self));
-#else
-                __m128i m;
-                _mm_storel_epi64(&m, self);
-                int64_t i;
-                std::memcpy(&i, &m, sizeof(i));
-                return i;
-#endif
-            }
-            else
-            {
-                assert(false && "unsupported arch/op combination");
-                return {};
-            }
-        }
-
     }
 }
 
diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp
index 9958692a8..3fd08be1f 100644
--- a/include/xsimd/arch/xsimd_sve.hpp
+++ b/include/xsimd/arch/xsimd_sve.hpp
@@ -949,6 +949,13 @@ namespace xsimd
             return svsel(index_predicate, broadcast(val, sve {}), arg);
         }
 
+        // first
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sve>) noexcept
+        {
+            return self.data[0];
+        }
+
         // all
         template <class A, class T, detail::sve_enable_all_t<T> = 0>
         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp
index f4721aedd..29d9aed0a 100644
--- a/include/xsimd/arch/xsimd_wasm.hpp
+++ b/include/xsimd/arch/xsimd_wasm.hpp
@@ -531,6 +531,42 @@ namespace xsimd
                                     static_cast<int32_t>(wasm_f32x4_extract_lane(self, 3)));
             }
         }
+        // first
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<wasm>) noexcept
+        {
+            return wasm_f32x4_extract_lane(self, 0);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<wasm>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return wasm_i8x16_extract_lane(self, 0);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return wasm_i16x8_extract_lane(self, 0);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return wasm_i32x4_extract_lane(self, 0);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return wasm_i64x2_extract_lane(self, 0);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<wasm>) noexcept
+        {
+            return wasm_f64x2_extract_lane(self, 0);
+        }
 
         // floor
         template <class A>
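
A minimal usage sketch, not part of the patch and assuming an x86 target where xsimd::sse2 is available: the scalar reductions below are the ones whose single-lane base case now returns through the first kernel instead of batch::get(0).

// Sketch only: exercises reduce_min/reduce_max on an SSE2 batch.
#include <xsimd/xsimd.hpp>
#include <cassert>
#include <cstdint>

int main()
{
    xsimd::batch<int32_t, xsimd::sse2> b(7, 1, 9, 4); // 4 lanes on SSE2
    assert(xsimd::reduce_max(b) == 9);
    assert(xsimd::reduce_min(b) == 1);
    return 0;
}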