diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp
index 0395853dc..b664ff582 100644
--- a/include/xsimd/arch/xsimd_avx512bw.hpp
+++ b/include/xsimd/arch/xsimd_avx512bw.hpp
@@ -486,33 +486,32 @@ namespace xsimd
         // slide_left
         namespace detail
         {
-            template <size_t... Is>
-            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
+            struct make_slide_perm_hi
             {
-                return { (Is == 0 ? 8 : Is - 1)... };
-            }
+                static constexpr uint64_t get(size_t i, size_t)
+                {
+                    return i == 0 ? 8 : i - 1;
+                }
+            };
 
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is >= N ? Is - N : 0)... };
-            }
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
+            template <size_t N>
+            struct make_slide_left_pattern
             {
-                return { (Is >= N ? 0xFFFF : 0x0000)... };
-            }
+                static constexpr uint16_t get(size_t i, size_t)
+                {
+                    return i >= N ? i - N : 0;
+                }
+            };
         }
 
         template <size_t N, class A, class T>
         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
         {
-            constexpr unsigned BitCount = N * 8;
-            if (BitCount == 0)
+            if (N == 0)
             {
                 return x;
             }
-            if (BitCount >= 512)
+            if (N >= 64)
             {
                 return batch<T, A>(T(0));
             }
@@ -526,10 +525,10 @@ namespace xsimd
                 buffer[0] = buffer[0] << 8;
                 xx = _mm512_load_epi64(&buffer[0]);
 
-                alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
+                auto slide_perm = xsimd::make_batch_constant<uint64_t, detail::make_slide_perm_hi, A>();
                 __m512i xl = _mm512_slli_epi64(x, 8);
                 __m512i xr = _mm512_srli_epi64(x, 56);
-                xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
+                xr = _mm512_permutex2var_epi64(xr, slide_perm.as_batch(), _mm512_setzero_si512());
                 xx = _mm512_or_si512(xr, xl);
                 if (N == 1)
                     return xx;
@@ -538,50 +537,49 @@ namespace xsimd
             {
                 xx = x;
             }
-            alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
+            __mmask32 mask = 0xFFFFFFFFu << ((N / 2) & 31);
+            auto slide_pattern = xsimd::make_batch_constant<uint16_t, detail::make_slide_left_pattern<N / 2>, A>();
+            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), xx);
         }
 
         // slide_right
         namespace detail
         {
-            template <size_t... Is>
-            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
+            struct make_slide_perm_low
             {
-                return { (Is + 1)... };
-            }
+                static constexpr uint64_t get(size_t i, size_t)
+                {
+                    return i + 1;
+                }
+            };
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is < (32 - N) ? Is + N : 0)... };
-            }
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
+            template <size_t N>
+            struct make_slide_right_pattern
             {
-                return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
-            }
+                static constexpr uint16_t get(size_t i, size_t n)
+                {
+                    return i < (n - N) ? i + N : 0;
+                }
+            };
         }
 
         template <size_t N, class A, class T>
         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
         {
-            constexpr unsigned BitCount = N * 8;
-            if (BitCount == 0)
+            if (N == 0)
             {
                 return x;
             }
-            if (BitCount >= 512)
+            if (N >= 64)
             {
                 return batch<T, A>(T(0));
             }
             batch<T, A> xx;
             if (N & 1)
             {
-                alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
+                auto slide_perm = xsimd::make_batch_constant<uint64_t, detail::make_slide_perm_low, A>();
                 __m512i xr = _mm512_srli_epi64(x, 8);
                 __m512i xl = _mm512_slli_epi64(x, 56);
-                xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
+                xl = _mm512_permutex2var_epi64(xl, slide_perm.as_batch(), _mm512_setzero_si512());
                 xx = _mm512_or_si512(xr, xl);
                 if (N == 1)
                     return xx;
@@ -590,9 +588,9 @@ namespace xsimd
             {
                 xx = x;
             }
-            alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
+            __mmask32 mask = 0xFFFFFFFFu >> ((N / 2) & 31);
+            auto slide_pattern = xsimd::make_batch_constant<uint16_t, detail::make_slide_right_pattern<N / 2>, A>();
+            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), xx);
         }
 
         // ssub
diff --git a/include/xsimd/arch/xsimd_avx512vbmi.hpp b/include/xsimd/arch/xsimd_avx512vbmi.hpp
index 680b23836..579f12309 100644
--- a/include/xsimd/arch/xsimd_avx512vbmi.hpp
+++ b/include/xsimd/arch/xsimd_avx512vbmi.hpp
@@ -26,17 +26,23 @@ namespace xsimd
 
         namespace detail
         {
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint8_t, sizeof...(Is)> make_slide_left_bytes_pattern(::xsimd::detail::index_sequence<Is...>)
+            template <size_t N>
+            struct make_slide_left_bytes_pattern
             {
-                return { (Is >= N ? Is - N : 0)... };
-            }
+                static constexpr uint8_t get(size_t i, size_t)
+                {
+                    return i >= N ? i - N : 0;
+                }
+            };
 
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint8_t, sizeof...(Is)> make_slide_right_bytes_pattern(::xsimd::detail::index_sequence<Is...>)
+            template <size_t N>
+            struct make_slide_right_bytes_pattern
             {
-                return { (Is < (64 - N) ? Is + N : 0)... };
-            }
+                static constexpr uint8_t get(size_t i, size_t n)
+                {
+                    return i < (n - N) ? i + N : 0;
+                }
+            };
         }
 
         // slide_left
@@ -53,8 +59,8 @@ namespace xsimd
             }
 
             __mmask64 mask = 0xFFFFFFFFFFFFFFFFull << (N & 63);
-            alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_bytes_pattern<N>(::xsimd::detail::make_index_sequence<512 / 8>());
-            return _mm512_maskz_permutexvar_epi8(mask, _mm512_load_epi32(slide_pattern.data()), x);
+            auto slide_pattern = xsimd::make_batch_constant<uint8_t, detail::make_slide_left_bytes_pattern<N>, A>();
+            return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
         }
 
         // slide_right
@@ -70,8 +76,8 @@ namespace xsimd
                 return batch<T, A>(T(0));
             }
             __mmask64 mask = 0xFFFFFFFFFFFFFFFFull >> (N & 63);
-            alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_bytes_pattern<N>(::xsimd::detail::make_index_sequence<512 / 8>());
-            return _mm512_maskz_permutexvar_epi8(mask, _mm512_load_epi32(slide_pattern.data()), x);
+            auto slide_pattern = xsimd::make_batch_constant<uint8_t, detail::make_slide_right_bytes_pattern<N>, A>();
+            return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
         }
 
         // swizzle (dynamic version)
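
For reviewers unfamiliar with the batch_constant machinery, here is a minimal standalone sketch of the generator pattern the patch switches to. The `shift_left_pattern` name and the `main()` harness are illustrative only (not part of the patch); `make_batch_constant` and `as_batch` are used exactly as in the diff above. The gain of the change: the lane values are encoded in the batch_constant's type, so the compiler can emit the permutation table as an in-register constant instead of the old `alignas` buffer plus `_mm512_load_*` round-trip, and the `maskz` form of the permute absorbs the separate AND against a mask table.

    // Build with AVX-512BW enabled, e.g. -mavx512bw.
    #include <cstddef>
    #include <cstdint>
    #include <xsimd/xsimd.hpp>

    // Same shape as detail::make_slide_left_pattern<N>: get(i, size) is
    // evaluated once per lane at compile time to produce the lane value.
    template <std::size_t N>
    struct shift_left_pattern
    {
        static constexpr uint16_t get(std::size_t i, std::size_t)
        {
            return i >= N ? static_cast<uint16_t>(i - N) : 0;
        }
    };

    int main()
    {
        // 32 x uint16_t permutation pattern for a 4-lane shift; no memory
        // load is needed, as_batch() materializes the immediate constant.
        auto pattern = xsimd::make_batch_constant<uint16_t, shift_left_pattern<4>, xsimd::avx512bw>();
        xsimd::batch<uint16_t, xsimd::avx512bw> v = pattern.as_batch();
        (void)v;
        return 0;
    }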