diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index a5fa3266d..7a2436d5a 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1064,22 +1064,10 @@ namespace xsimd tmp = _mm256_hadd_ps(tmp, tmp); return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0)); } - template - XSIMD_INLINE double reduce_add(batch const& rhs, requires_arch) noexcept - { - // rhs = (x0, x1, x2, x3) - // tmp = (x2, x3, x0, x1) - __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1); - // tmp = (x2+x0, x3+x1, -, -) - tmp = _mm256_add_pd(rhs, tmp); - // tmp = (x2+x0+x3+x1, -, -, -) - tmp = _mm256_hadd_pd(tmp, tmp); - return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0)); - } - template ::value, void>::type> + template ::value || std::is_same::value, void>::type> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { - __m128i low, high; + typename batch::register_type low, high; detail::split_avx(self, low, high); batch blow(low), bhigh(high); return reduce_add(blow) + reduce_add(bhigh); diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 79e29b7b0..d58724a61 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -1506,22 +1506,12 @@ namespace xsimd template XSIMD_INLINE float reduce_add(batch const& rhs, requires_arch) noexcept { - __m128 tmp1 = _mm512_extractf32x4_ps(rhs, 0); - __m128 tmp2 = _mm512_extractf32x4_ps(rhs, 1); - __m128 tmp3 = _mm512_extractf32x4_ps(rhs, 2); - __m128 tmp4 = _mm512_extractf32x4_ps(rhs, 3); - __m128 res1 = _mm_add_ps(tmp1, tmp2); - __m128 res2 = _mm_add_ps(tmp3, tmp4); - __m128 res3 = _mm_add_ps(res1, res2); - return reduce_add(batch(res3), sse4_2 {}); + return _mm512_reduce_add_ps(rhs); } template XSIMD_INLINE double reduce_add(batch const& rhs, requires_arch) noexcept { - __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1); - __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0); - __m256d res1 = _mm256_add_pd(tmp1, tmp2); - return reduce_add(batch(res1), avx2 {}); + return _mm512_reduce_add_pd(rhs); } template ::value, void>::type> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_sse3.hpp b/include/xsimd/arch/xsimd_sse3.hpp index ffdc5bc9f..9dbc4b343 100644 --- a/include/xsimd/arch/xsimd_sse3.hpp +++ b/include/xsimd/arch/xsimd_sse3.hpp @@ -50,12 +50,6 @@ namespace xsimd __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0); return _mm_cvtss_f32(tmp1); } - template - XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept - { - __m128d tmp0 = _mm_hadd_pd(self, self); - return _mm_cvtsd_f64(tmp0); - } }