From b1d1eb2670d03dc3570e21089f4e6f52f04e9241 Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Thu, 3 Jul 2025 16:51:22 +0200
Subject: [PATCH] Use xsimd::first in reduction implementation

Provide specialization for all remaining architectures.
---
 .../xsimd/arch/common/xsimd_common_math.hpp |   2 +-
 include/xsimd/arch/xsimd_emulated.hpp       |   7 ++
 include/xsimd/arch/xsimd_rvv.hpp            |  13 +++
 include/xsimd/arch/xsimd_sse2.hpp           | 102 +++++++++---------
 include/xsimd/arch/xsimd_sve.hpp            |   7 ++
 include/xsimd/arch/xsimd_wasm.hpp           |  36 +++++++
 6 files changed, 115 insertions(+), 52 deletions(-)

diff --git a/include/xsimd/arch/common/xsimd_common_math.hpp b/include/xsimd/arch/common/xsimd_common_math.hpp
index 9bc460922..689029aae 100644
--- a/include/xsimd/arch/common/xsimd_common_math.hpp
+++ b/include/xsimd/arch/common/xsimd_common_math.hpp
@@ -2117,7 +2117,7 @@ namespace xsimd
             template <class Op, class A, class T>
             XSIMD_INLINE T reduce(Op, batch<T, A> const& self, std::integral_constant<unsigned, 1>) noexcept
             {
-                return self.get(0);
+                return ::xsimd::kernel::first(self, A {});
             }
 
             template <class Op, class A, class T, unsigned Lvl>
diff --git a/include/xsimd/arch/xsimd_emulated.hpp b/include/xsimd/arch/xsimd_emulated.hpp
index 11e607479..bf169cd9a 100644
--- a/include/xsimd/arch/xsimd_emulated.hpp
+++ b/include/xsimd/arch/xsimd_emulated.hpp
@@ -230,6 +230,13 @@ namespace xsimd
             return r;
         }
 
+        // first
+        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+        T XSIMD_INLINE first(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+        {
+            return self.data[0];
+        }
+
 #if 0
         // count
         template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp
index ef00e7567..610c0944f 100644
--- a/include/xsimd/arch/xsimd_rvv.hpp
+++ b/include/xsimd/arch/xsimd_rvv.hpp
@@ -1335,6 +1335,19 @@ namespace xsimd
            return result;
        }
 
+        // first
+        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        XSIMD_INLINE T first(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+        {
+            return detail::rvvmv_lane0(arg);
+        }
+
+        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        XSIMD_INLINE std::complex<T> first(batch<std::complex<T>, A> const& arg, requires_arch<rvv>) noexcept
+        {
+            return std::complex<T> { detail::rvvmv_lane0(arg.real()), detail::rvvmv_lane0(arg.imag()) };
+        }
+
         // insert
         template <class A, class T, size_t I, detail::rvv_enable_all_t<T> = 0>
         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<rvv>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index 59a39363b..17bdd2bb2 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -673,6 +673,53 @@ namespace xsimd
             return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other)));
         }
 
+        // first
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtss_f32(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(self);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFFFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+#if defined(__x86_64__)
+                return static_cast<T>(_mm_cvtsi128_si64(self));
+#else
+                __m128i m;
+                _mm_storel_epi64(&m, self);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
         // from_mask
         template <class A>
         XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
@@ -1269,10 +1316,10 @@ namespace xsimd
             batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
             batch<T, A> acc2 = max(acc1, step2);
             if (sizeof(T) == 2)
-                return acc2.get(0);
+                return first(acc2, A {});
             batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
             batch<T, A> acc3 = max(acc2, step3);
-            return acc3.get(0);
+            return first(acc3, A {});
         }
 
         // reduce_min
@@ -1291,10 +1338,10 @@ namespace xsimd
             batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
             batch<T, A> acc2 = min(acc1, step2);
             if (sizeof(T) == 2)
-                return acc2.get(0);
+                return first(acc2, A {});
             batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
             batch<T, A> acc3 = min(acc2, step3);
-            return acc3.get(0);
+            return first(acc3, A {});
         }
 
         // rsqrt
@@ -1783,53 +1830,6 @@ namespace xsimd
             return _mm_unpacklo_pd(self, other);
         }
 
-        // first
-        template <class A>
-        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<sse2>) noexcept
-        {
-            return _mm_cvtss_f32(self);
-        }
-
-        template <class A>
-        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<sse2>) noexcept
-        {
-            return _mm_cvtsd_f64(self);
-        }
-
-        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sse2>) noexcept
-        {
-            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFFFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(self));
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-            {
-#if defined(__x86_64__)
-                return static_cast<T>(_mm_cvtsi128_si64(self));
-#else
-                __m128i m;
-                _mm_storel_epi64(&m, self);
-                int64_t i;
-                std::memcpy(&i, &m, sizeof(i));
-                return i;
-#endif
-            }
-            else
-            {
-                assert(false && "unsupported arch/op combination");
-                return {};
-            }
-        }
-
     }
 }
 
diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp
index 9958692a8..3fd08be1f 100644
--- a/include/xsimd/arch/xsimd_sve.hpp
+++ b/include/xsimd/arch/xsimd_sve.hpp
@@ -949,6 +949,13 @@ namespace xsimd
             return svsel(index_predicate, broadcast(val, sve {}), arg);
         }
 
+        // first
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sve>) noexcept
+        {
+            return self.data[0];
+        }
+
         // all
         template <class A, class T, detail::sve_enable_all_t<T> = 0>
         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp
index f4721aedd..29d9aed0a 100644
--- a/include/xsimd/arch/xsimd_wasm.hpp
+++ b/include/xsimd/arch/xsimd_wasm.hpp
@@ -531,6 +531,42 @@ namespace xsimd
                                     static_cast<int32_t>(wasm_f32x4_extract_lane(self, 3)));
             }
         }
+        // first
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<wasm>) noexcept
+        {
+            return wasm_f32x4_extract_lane(self, 0);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<wasm>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return wasm_i8x16_extract_lane(self, 0);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return wasm_i16x8_extract_lane(self, 0);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return wasm_i32x4_extract_lane(self, 0);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return wasm_i64x2_extract_lane(self, 0);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<wasm>) noexcept
+        {
+            return wasm_f64x2_extract_lane(self, 0);
+        }
 
         // floor
         template <class A>
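
A minimal usage sketch, not part of the patch and assuming an x86 target where xsimd::sse2 is available: the scalar reductions below are the ones whose single-lane base case now returns through the first kernel instead of batch::get(0).

// Sketch only: exercises reduce_min/reduce_max on an SSE2 batch.
#include <xsimd/xsimd.hpp>
#include <cassert>
#include <cstdint>

int main()
{
    xsimd::batch<int32_t, xsimd::sse2> b(7, 1, 9, 4); // 4 lanes on SSE2
    assert(xsimd::reduce_max(b) == 9);
    assert(xsimd::reduce_min(b) == 1);
    return 0;
}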