diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp
index b664ff582..14858e90c 100644
--- a/include/xsimd/arch/xsimd_avx512bw.hpp
+++ b/include/xsimd/arch/xsimd_avx512bw.hpp
@@ -484,113 +484,25 @@ namespace xsimd
         }
 
         // slide_left
-        namespace detail
-        {
-            struct make_slide_perm_hi
-            {
-                static constexpr uint64_t get(size_t i, size_t)
-                {
-                    return i == 0 ? 8 : i - 1;
-                }
-            };
-
-            template <size_t N>
-            struct make_slide_left_pattern
-            {
-                static constexpr uint16_t get(size_t i, size_t)
-                {
-                    return i >= N ? i - N : 0;
-                }
-            };
-        }
-
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) == 2 && N < 64, void>::type>
         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
         {
-            if (N == 0)
-            {
-                return x;
-            }
-            if (N >= 64)
-            {
-                return batch<T, A>(T(0));
-            }
-            batch<T, A> xx;
-            if (N & 1)
-            {
-                alignas(A::alignment()) uint64_t buffer[8];
-                _mm512_store_epi64(&buffer[0], x);
-                for (int i = 7; i > 0; --i)
-                    buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56);
-                buffer[0] = buffer[0] << 8;
-                xx = _mm512_load_epi64(&buffer[0]);
+            static_assert((N & 3) == 2 && N < 64, "The AVX512F implementation may have a lower latency.");
 
-                auto slide_perm = xsimd::make_batch_constant<uint64_t, detail::make_slide_perm_hi, A>();
-                __m512i xl = _mm512_slli_epi64(x, 8);
-                __m512i xr = _mm512_srli_epi64(x, 56);
-                xr = _mm512_permutex2var_epi64(xr, slide_perm.as_batch(), _mm512_setzero_si512());
-                xx = _mm512_or_si512(xr, xl);
-                if (N == 1)
-                    return xx;
-            }
-            else
-            {
-                xx = x;
-            }
             __mmask32 mask = 0xFFFFFFFFu << ((N / 2) & 31);
-            auto slide_pattern = xsimd::make_batch_constant<uint16_t, detail::make_slide_left_pattern<N / 2>, A>();
-            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), xx);
+            auto slide_pattern = make_batch_constant<uint16_t, detail::make_slide_left_pattern<N / 2>, A>();
+            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), x);
         }
 
         // slide_right
-        namespace detail
-        {
-            struct make_slide_perm_low
-            {
-                static constexpr uint64_t get(size_t i, size_t)
-                {
-                    return i + 1;
-                }
-            };
-
-            template <size_t N>
-            struct make_slide_right_pattern
-            {
-                static constexpr uint16_t get(size_t i, size_t n)
-                {
-                    return i < (n - N) ? i + N : 0;
-                }
-            };
-        }
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) == 2 && N < 64, void>::type>
        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
         {
-            if (N == 0)
-            {
-                return x;
-            }
-            if (N >= 64)
-            {
-                return batch<T, A>(T(0));
-            }
-            batch<T, A> xx;
-            if (N & 1)
-            {
-                auto slide_perm = xsimd::make_batch_constant<uint64_t, detail::make_slide_perm_low, A>();
-                __m512i xr = _mm512_srli_epi64(x, 8);
-                __m512i xl = _mm512_slli_epi64(x, 56);
-                xl = _mm512_permutex2var_epi64(xl, slide_perm.as_batch(), _mm512_setzero_si512());
-                xx = _mm512_or_si512(xr, xl);
-                if (N == 1)
-                    return xx;
-            }
-            else
-            {
-                xx = x;
-            }
+            static_assert((N & 3) == 2 && N < 64, "The AVX512F implementation may have a lower latency.");
+
             __mmask32 mask = 0xFFFFFFFFu >> ((N / 2) & 31);
-            auto slide_pattern = xsimd::make_batch_constant<uint16_t, detail::make_slide_right_pattern<N / 2>, A>();
-            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), xx);
+            auto slide_pattern = make_batch_constant<uint16_t, detail::make_slide_right_pattern<N / 2>, A>();
+            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), x);
         }
 
         // ssub
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index 5985ab388..5bc6672d9 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -1859,19 +1859,110 @@ namespace xsimd
         }
 
         // slide_left
+        namespace detail
+        {
+            template <size_t N>
+            struct make_slide_left_pattern
+            {
+                static constexpr size_t get(size_t i, size_t)
+                {
+                    return i >= N ? i - N : 0;
+                }
+            };
+
+            template <size_t N, class A, class T>
+            XSIMD_INLINE batch<T, A> slide_left_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
+            {
+                static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
+
+                if (N == 0)
+                {
+                    return x;
+                }
+                if (N >= 64)
+                {
+                    return batch<T, A>(T(0));
+                }
+
+                __mmask16 mask = uint16_t(0xFFFFu << (N / 4));
+
+                if ((N & 15) == 0)
+                {
+                    const uint8_t imm8 = uint8_t(0xe4 << (2 * (N / 16)));
+                    return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
+                }
+
+                auto slide_pattern = make_batch_constant<uint32_t, make_slide_left_pattern<N / 4>, A>();
+                return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
+            }
+        }
+
         template <size_t N, class A, class T>
-        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const&, requires_arch<avx512f>) noexcept
+        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512f>) noexcept
         {
-            static_assert(N == 0xDEAD, "not implemented yet");
-            return {};
+            constexpr size_t NN = N & ~3;
+            if (N == NN || NN >= 64)
+            {
+                // Call fast path
+                return detail::slide_left_aligned_u32<N, A, T>(x, A {});
+            }
+
+            __m512i xl = detail::slide_left_aligned_u32<NN, A, T>(_mm512_slli_epi32(x, 8 * (N - NN)), A {});
+            __m512i xr = detail::slide_left_aligned_u32<NN + 4, A, T>(_mm512_srli_epi32(x, 32 - 8 * (N - NN)), A {});
+            return _mm512_or_epi32(xl, xr);
         }
 
         // slide_right
+        namespace detail
+        {
+            template <size_t N>
+            struct make_slide_right_pattern
+            {
+                static constexpr size_t get(size_t i, size_t n)
+                {
+                    return i < (n - N) ? i + N : 0;
+                }
+            };
+
+            template <size_t N, class A, class T>
+            XSIMD_INLINE batch<T, A> slide_right_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
+            {
+                static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
+
+                if (N == 0)
+                {
+                    return x;
+                }
+                if (N >= 64)
+                {
+                    return batch<T, A>(T(0));
+                }
+
+                __mmask16 mask = 0xFFFFu >> (N / 4);
+
+                if ((N & 15) == 0)
+                {
+                    const uint8_t imm8 = 0xe4 >> (2 * (N / 16));
+                    return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
+                }
+
+                auto slide_pattern = make_batch_constant<uint32_t, make_slide_right_pattern<N / 4>, A>();
+                return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
+            }
+        }
         template <size_t N, class A, class T>
-        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const&, requires_arch<avx512f>) noexcept
+        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512f>) noexcept
         {
-            static_assert(N == 0xDEAD, "not implemented yet");
-            return {};
+            constexpr size_t NN = N & ~3;
+            if (N == NN || NN >= 64)
+            {
+                // Call fast path
+                return detail::slide_right_aligned_u32<N, A, T>(x, A {});
+            }
+
+            __m512i xl = detail::slide_right_aligned_u32<NN + 4, A, T>(_mm512_slli_epi32(x, 32 - 8 * (N - NN)), A {});
+            __m512i xr = detail::slide_right_aligned_u32<NN, A, T>(_mm512_srli_epi32(x, 8 * (N - NN)), A {});
+            return _mm512_or_epi32(xl, xr);
         }
 
         // sqrt
diff --git a/include/xsimd/arch/xsimd_avx512vbmi.hpp b/include/xsimd/arch/xsimd_avx512vbmi.hpp
index 579f12309..7158ec839 100644
--- a/include/xsimd/arch/xsimd_avx512vbmi.hpp
+++ b/include/xsimd/arch/xsimd_avx512vbmi.hpp
@@ -24,59 +24,25 @@ namespace xsimd
     {
         using namespace types;
 
-        namespace detail
-        {
-            template <size_t N>
-            struct make_slide_left_bytes_pattern
-            {
-                static constexpr uint8_t get(size_t i, size_t)
-                {
-                    return i >= N ? i - N : 0;
-                }
-            };
-
-            template <size_t N>
-            struct make_slide_right_bytes_pattern
-            {
-                static constexpr uint8_t get(size_t i, size_t n)
-                {
-                    return i < (n - N) ? i + N : 0;
-                }
-            };
-        }
-
         // slide_left
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) != 0 && N < 64, void>::type>
         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512vbmi>) noexcept
         {
-            if (N == 0)
-            {
-                return x;
-            }
-            if (N >= 64)
-            {
-                return batch<T, A>(T(0));
-            }
+            static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency.");
 
             __mmask64 mask = 0xFFFFFFFFFFFFFFFFull << (N & 63);
-            auto slide_pattern = xsimd::make_batch_constant<uint8_t, detail::make_slide_left_bytes_pattern<N>, A>();
+            auto slide_pattern = make_batch_constant<uint8_t, detail::make_slide_left_pattern<N>, A>();
             return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
         }
 
         // slide_right
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) != 0 && N < 64, void>::type>
         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512vbmi>) noexcept
         {
-            if (N == 0)
-            {
-                return x;
-            }
-            if (N >= 64)
-            {
-                return batch<T, A>(T(0));
-            }
+            static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency.");
+
             __mmask64 mask = 0xFFFFFFFFFFFFFFFFull >> (N & 63);
-            auto slide_pattern = xsimd::make_batch_constant<uint8_t, detail::make_slide_right_bytes_pattern<N>, A>();
+            auto slide_pattern = make_batch_constant<uint8_t, detail::make_slide_right_pattern<N>, A>();
             return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
         }
 
diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp
index bc89aafd5..846da5b7d 100644
--- a/test/test_shuffle.cpp
+++ b/test/test_shuffle.cpp
@@ -166,7 +166,6 @@ namespace
     };
 }
 
-#if !XSIMD_WITH_AVX512F || XSIMD_WITH_AVX512BW
 template <class B>
 struct slide_test : public init_slide_base
 {
@@ -270,8 +269,6 @@ TEST_CASE_TEMPLATE("[slide]", B, BATCH_INT_TYPES)
     }
 }
 
-#endif
-
 template <class B>
 struct compress_test
 {
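
Not part of the patch: below is a minimal standalone sketch of the contract the rewritten kernels must preserve, written against the public xsimd::slide_left<N> / xsimd::slide_right<N> entry points (assumed here to be the user-facing wrappers over these kernels; test_shuffle.cpp exercises the same behavior). slide_left<N> moves every byte N positions toward higher indices and zero-fills the vacated low bytes, slide_right<N> does the opposite, and N is a compile-time byte count, which is what lets the overloads above dispatch on N & 3.

// Sketch only: compares slide_left/slide_right against a scalar reference.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <xsimd/xsimd.hpp>

int main()
{
    using batch = xsimd::batch<uint8_t>; // best architecture enabled at compile time
    constexpr std::size_t size = batch::size;

    // Fill a buffer with 1, 2, 3, ... and load it into one register.
    uint8_t in[size];
    for (std::size_t i = 0; i < size; ++i)
        in[i] = static_cast<uint8_t>(i + 1);
    batch x = batch::load_unaligned(in);

    // The slide amount is a byte count and a template parameter.
    constexpr std::size_t N = 3;
    uint8_t left[size], right[size];
    xsimd::slide_left<N>(x).store_unaligned(left);
    xsimd::slide_right<N>(x).store_unaligned(right);

    // Scalar reference: slide_left sends byte i to byte i + N (low bytes become 0),
    // slide_right sends byte i to byte i - N (high bytes become 0).
    for (std::size_t i = 0; i < size; ++i)
    {
        uint8_t expect_left = (i < N) ? uint8_t(0) : in[i - N];
        uint8_t expect_right = (i + N < size) ? in[i + N] : uint8_t(0);
        if (left[i] != expect_left || right[i] != expect_right)
        {
            std::printf("mismatch at byte %zu\n", i);
            return 1;
        }
    }
    std::printf("slide_left<%zu>/slide_right<%zu> match the scalar reference\n", N, N);
    return 0;
}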