diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index ab24b4868..193038e75 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -17,6 +17,7 @@ jobs: - { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex' } - { compiler: 'gcc', version: '14', flags: 'avx' } - { compiler: 'gcc', version: '13', flags: 'avx512' } + - { compiler: 'gcc', version: '10', flags: 'avx512' } - { compiler: 'gcc', version: '12', flags: 'i386' } - { compiler: 'gcc', version: '13', flags: 'avx512pf' } - { compiler: 'gcc', version: '13', flags: 'avx512vbmi' } diff --git a/include/xsimd/arch/common/xsimd_common_swizzle.hpp b/include/xsimd/arch/common/xsimd_common_swizzle.hpp index 5f51aaf9b..f9cfa0865 100644 --- a/include/xsimd/arch/common/xsimd_common_swizzle.hpp +++ b/include/xsimd/arch/common/xsimd_common_swizzle.hpp @@ -167,6 +167,49 @@ namespace xsimd return cross_impl<0, sizeof...(Vs), sizeof...(Vs) / 2, Vs...>::value; } + // 128-bit lane aware cross_impl: checks per 128-bit lane + template + struct cross_impl128 + { + static constexpr std::size_t Vi = static_cast(get_at::value); + static constexpr bool curr = ((I / LaneElems) != (static_cast(Vi) / LaneElems)); + static constexpr bool next = cross_impl128::value; + static constexpr bool value = curr || next; + }; + template + struct cross_impl128 + { + static constexpr bool value = false; + }; + + template + XSIMD_INLINE constexpr bool is_cross_lane_128() noexcept + { + static_assert(sizeof...(Vs) >= 1, "Need at least one lane"); + constexpr std::size_t N = sizeof...(Vs); + constexpr std::size_t lane_elems = 16 / sizeof(ElemT); + return cross_impl128<0, N, lane_elems, U, Vs...>::value; + } + + // overload accepting an element type first to compute 128-bit lane size + template + XSIMD_INLINE constexpr bool is_cross_lane() noexcept + { + static_assert(std::is_integral::value, "swizzle mask values must be integral"); + return is_cross_lane_128(); + } + + // convenience overload taking element type then integer non-type parameter pack + template + XSIMD_INLINE constexpr bool is_cross_lane() noexcept + { + return is_cross_lane_128(); + } + template XSIMD_INLINE constexpr bool is_identity() noexcept { return detail::identity_impl<0, T, Vs...>(); } template @@ -184,7 +227,11 @@ namespace xsimd template XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant) noexcept { return detail::is_only_from_hi(); } template - XSIMD_INLINE constexpr bool is_cross_lane(batch_constant) noexcept { return detail::is_cross_lane(); } + XSIMD_INLINE constexpr bool is_cross_lane(batch_constant) noexcept + { + static_assert(std::is_integral::value, "swizzle mask values must be integral"); + return is_cross_lane_128(); + } } // namespace detail } // namespace kernel diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index b9ec7916b..5ccf165f1 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -2737,15 +2737,15 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { - return static_cast(_mm512_cvtsi512_si32(self) & 0xFF); + return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { - return static_cast(_mm512_cvtsi512_si32(self) & 0xFFFF); + return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - return static_cast(_mm512_cvtsi512_si32(self)); + return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self))); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { diff --git a/test/test_batch_manip.cpp b/test/test_batch_manip.cpp index 7da46e736..1f40726fb 100644 --- a/test/test_batch_manip.cpp +++ b/test/test_batch_manip.cpp @@ -52,11 +52,24 @@ namespace xsimd static_assert(is_dup_hi(), "4-lane dup_hi failed"); static_assert(!is_dup_lo(), "4-lane dup_lo on dup_hi"); - static_assert(is_cross_lane<0, 1, 0, 1>(), "dup-lo only → crossing"); - static_assert(is_cross_lane<2, 3, 2, 3>(), "dup-hi only → crossing"); - static_assert(is_cross_lane<0, 3, 3, 3>(), "one low + rest high → crossing"); - static_assert(!is_cross_lane<1, 0, 2, 3>(), "mixed low/high → no crossing"); - static_assert(!is_cross_lane<0, 1, 2, 3>(), "mixed low/high → no crossing"); + static_assert(is_cross_lane(), "dup-lo only → crossing"); + static_assert(is_cross_lane(), "dup-hi only → crossing"); + static_assert(is_cross_lane(), "one low + rest high → crossing"); + static_assert(!is_cross_lane(), "mixed low/high → no crossing"); + static_assert(!is_cross_lane(), "mixed low/high → no crossing"); + // 8-lane 128-bit lane checks (use double/int64 for 2-elements-per-128-bit lanes) + static_assert(is_cross_lane(), "8-lane 128-bit swap → crossing"); + static_assert(!is_cross_lane(), "identity 8-lane → no crossing"); + static_assert(is_cross_lane(), "8-lane uint64_t swap → crossing"); + static_assert(is_cross_lane(), "8-lane int32_t swap → crossing"); + + // Additional compile-time checks for 16-element batches (e.g. float/int32) + static_assert(is_cross_lane(), + "16-lane 128-bit swap → crossing"); + static_assert(!is_cross_lane(), + "identity 16-lane → no crossing"); + static_assert(is_cross_lane(), + "16-lane uint32_t swap → crossing"); } } } diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp index b08210974..898f7e519 100644 --- a/test/test_shuffle.cpp +++ b/test/test_shuffle.cpp @@ -672,10 +672,15 @@ struct shuffle_test } }; +#if defined(__GNUC__) && (__GNUC__ == 10) && XSIMD_WITH_AVX512F + // Use zip_lo as a stable reference for the expected interleave. + B b_ref_lo = xsimd::zip_lo(b_lhs, b_rhs); +#else std::array ref_lo; for (size_t i = 0; i < size; ++i) ref_lo[i] = (i & 1) ? rhs[i / 2] : lhs[i / 2]; B b_ref_lo = B::load_unaligned(ref_lo.data()); +#endif INFO("zip_lo"); B b_res_lo = xsimd::shuffle(b_lhs, b_rhs, xsimd::make_batch_constant()); @@ -689,12 +694,17 @@ struct shuffle_test } }; +#if defined(__GNUC__) && (__GNUC__ == 10) && XSIMD_WITH_AVX512F + // Use zip_hi as a stable reference for the expected interleave. + B b_ref_hi = xsimd::zip_hi(b_lhs, b_rhs); +#else std::array ref_hi; for (size_t i = 0; i < size; ++i) { ref_hi[i] = (i & 1) ? rhs[size / 2 + i / 2] : lhs[size / 2 + i / 2]; } B b_ref_hi = B::load_unaligned(ref_hi.data()); +#endif INFO("zip_hi"); B b_res_hi = xsimd::shuffle(b_lhs, b_rhs, xsimd::make_batch_constant());